From a9d49272356f26ce643a444996ff8bf22e087371 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
Date: Fri, 6 Feb 2026 03:54:00 +0800
Subject: [PATCH] [TRTLLM-10752][chore] set default val of
 max_num_tokens_in_buffer as max_seq_len or max_input_len (#11082)

Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/py_executor_creator.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index ce6da4b02c..6dafff2978 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -449,6 +449,10 @@ def create_py_executor(
     max_num_tokens = model_engine.max_num_tokens
     sparse_attention_config = model_engine.sparse_attention_config
 
+    # Set default value for cache_transceiver_config.max_tokens_in_buffer
+    if cache_transceiver_config and cache_transceiver_config.max_tokens_in_buffer is None:
+        cache_transceiver_config.max_tokens_in_buffer = net_max_seq_len
+
     config = model_engine.model.model_config.pretrained_config
     if is_mla(config):
         if model_engine.model.model_config.enable_flash_mla:
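
Note: the sketch below illustrates the defaulting behavior this patch introduces, in
isolation from the executor. It is a minimal, self-contained approximation; the
CacheTransceiverConfig dataclass and apply_default helper here are simplified
stand-ins for the real TensorRT-LLM types, not the actual API.

    # Minimal sketch of the patch's defaulting logic (stand-in types, not the real API).
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class CacheTransceiverConfig:
        # None means the user did not set an explicit buffer size.
        max_tokens_in_buffer: Optional[int] = None

    def apply_default(cache_transceiver_config: Optional[CacheTransceiverConfig],
                      net_max_seq_len: int) -> None:
        # Mirrors the added lines: fill in max_tokens_in_buffer only when the
        # config exists and the field was left unset.
        if cache_transceiver_config and cache_transceiver_config.max_tokens_in_buffer is None:
            cache_transceiver_config.max_tokens_in_buffer = net_max_seq_len

    # An unset buffer size inherits the engine's max sequence length.
    cfg = CacheTransceiverConfig()
    apply_default(cfg, net_max_seq_len=8192)
    assert cfg.max_tokens_in_buffer == 8192

    # An explicit user-provided value is left untouched.
    cfg = CacheTransceiverConfig(max_tokens_in_buffer=4096)
    apply_default(cfg, net_max_seq_len=8192)
    assert cfg.max_tokens_in_buffer == 4096

The guard on cache_transceiver_config itself matters: when no cache transceiver is
configured at all, the patch leaves everything alone rather than creating a config.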