[https://nvbugs/5525849][fix] Cherry-pick to fix mismatch of max seq len between kv cache manager and dummy requests (#7855)

Signed-off-by: Hui Gao <huig@nvidia.com>
This commit is contained in:
HuiGao-NV 2025-09-22 18:07:47 +08:00 committed by GitHub
parent 8cf95681e6
commit 0dac1ddb74
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -73,8 +73,8 @@ class KvCacheCreator:
self._tokens_per_block = tokens_per_block
self._max_seq_len = max_seq_len
self._max_batch_size = max_batch_size
self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
1)
self._net_max_seq_len = net_max_seq_len
self._dummy_reqs = None
@staticmethod
def _get_cache_size_per_token(model_config: ModelConfig,
@@ -199,7 +199,7 @@ class KvCacheCreator:
if self._dummy_reqs is None:
self._dummy_reqs = self._create_dummy_context_requests(
max(1, self.net_max_seq_len - 1))
max(1, self._net_max_seq_len - 1))
for req in self._dummy_reqs:
num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
# Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
@@ -482,7 +482,14 @@ class KvCacheCreator:
)
# KVCacheManager (Non-draft) modifies the max_seq_len field, update it to self
if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER:
self._max_seq_len = kv_cache_manager.max_seq_len
# When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
if kv_cache_manager is not None:
if kv_cache_manager.max_seq_len < self._max_seq_len:
self._dummy_reqs = self._create_dummy_context_requests(
max(
1, self._net_max_seq_len - 1 -
(self._max_seq_len - kv_cache_manager.max_seq_len)))
self._max_seq_len = kv_cache_manager.max_seq_len
# When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
if kv_cache_manager is not None: