Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-16 07:53:55 +08:00)
[TRTLLM-10752][chore] set default val of max_num_tokens_in_buffer as max_seq_len or max_input_len (#11082)
Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
parent a7494a5ff4
commit a9d4927235
@@ -449,6 +449,10 @@ def create_py_executor(
     max_num_tokens = model_engine.max_num_tokens
     sparse_attention_config = model_engine.sparse_attention_config
 
+    # Set default value for cache_transceiver_config.max_tokens_in_buffer
+    if cache_transceiver_config and cache_transceiver_config.max_tokens_in_buffer is None:
+        cache_transceiver_config.max_tokens_in_buffer = net_max_seq_len
+
     config = model_engine.model.model_config.pretrained_config
     if is_mla(config):
         if model_engine.model.model_config.enable_flash_mla:
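For context, below is a minimal standalone sketch of the defaulting behavior this hunk introduces. The CacheTransceiverConfig dataclass and apply_default helper are illustrative stand-ins rather than the actual TensorRT-LLM definitions; the point is only that max_tokens_in_buffer gets filled in from the engine's sequence-length limit when the user has not set it explicitly, and is left untouched otherwise.

# Illustrative sketch only; names do not match the real TensorRT-LLM classes.
from dataclasses import dataclass
from typing import Optional


@dataclass
class CacheTransceiverConfig:
    # Stand-in for the real config: None means "let the executor pick a default".
    max_tokens_in_buffer: Optional[int] = None


def apply_default(cfg: Optional[CacheTransceiverConfig],
                  net_max_seq_len: int) -> Optional[CacheTransceiverConfig]:
    # Mirrors the added logic: only set the buffer size when the config
    # exists and no explicit value was provided by the user.
    if cfg and cfg.max_tokens_in_buffer is None:
        cfg.max_tokens_in_buffer = net_max_seq_len
    return cfg


if __name__ == "__main__":
    # No explicit value -> defaults to the network's max sequence length.
    print(apply_default(CacheTransceiverConfig(), net_max_seq_len=8192))
    # Explicit value -> preserved unchanged.
    print(apply_default(CacheTransceiverConfig(max_tokens_in_buffer=4096),
                        net_max_seq_len=8192))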