From 0dac1ddb746bf879f29764b370591c4163c6cfe4 Mon Sep 17 00:00:00 2001 From: HuiGao-NV Date: Mon, 22 Sep 2025 18:07:47 +0800 Subject: [PATCH] [https://nvbugs/5525849][fix] Cherry-pick to fix mismatch of max seq len between kv cache manager and dummy requests (#7855) Signed-off-by: Hui Gao --- tensorrt_llm/_torch/pyexecutor/_util.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index e8d68a5938..60fe2cecb9 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -73,8 +73,8 @@ class KvCacheCreator: self._tokens_per_block = tokens_per_block self._max_seq_len = max_seq_len self._max_batch_size = max_batch_size - self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len - - 1) + self._net_max_seq_len = net_max_seq_len + self._dummy_reqs = None @staticmethod def _get_cache_size_per_token(model_config: ModelConfig, @@ -199,7 +199,7 @@ class KvCacheCreator: if self._dummy_reqs is None: self._dummy_reqs = self._create_dummy_context_requests( - max(1, self.net_max_seq_len - 1)) + max(1, self._net_max_seq_len - 1)) for req in self._dummy_reqs: num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq # Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size. @@ -482,7 +482,14 @@ class KvCacheCreator: ) # KVCacheManager (Non-draft) modifies the max_seq_len field, update it to self if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER: - self._max_seq_len = kv_cache_manager.max_seq_len + # When SWA is enabled, max_seq_len is updated inside kv_cache_manager. + if kv_cache_manager is not None: + if kv_cache_manager.max_seq_len < self._max_seq_len: + self._dummy_reqs = self._create_dummy_context_requests( + max( + 1, self._net_max_seq_len - 1 - + (self._max_seq_len - kv_cache_manager.max_seq_len))) + self._max_seq_len = kv_cache_manager.max_seq_len # When SWA is enabled, max_seq_len is updated inside kv_cache_manager. if kv_cache_manager is not None: