mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-07 11:41:47 +08:00
[https://nvbugs/5525849][fix] Cherry-pick to fix mismatch of max seq len between kv cache manager and dummy requests (#7855)
Signed-off-by: Hui Gao <huig@nvidia.com>
This commit is contained in:
parent
8cf95681e6
commit
0dac1ddb74
@@ -73,8 +73,8 @@ class KvCacheCreator:
|
||||
self._tokens_per_block = tokens_per_block
|
||||
self._max_seq_len = max_seq_len
|
||||
self._max_batch_size = max_batch_size
|
||||
self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
|
||||
1)
|
||||
self._net_max_seq_len = net_max_seq_len
|
||||
self._dummy_reqs = None
|
||||
|
||||
@staticmethod
|
||||
def _get_cache_size_per_token(model_config: ModelConfig,
|
||||
@@ -199,7 +199,7 @@ class KvCacheCreator:
|
||||
|
||||
if self._dummy_reqs is None:
|
||||
self._dummy_reqs = self._create_dummy_context_requests(
|
||||
max(1, self.net_max_seq_len - 1))
|
||||
max(1, self._net_max_seq_len - 1))
|
||||
for req in self._dummy_reqs:
|
||||
num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
|
||||
# Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
|
||||
@@ -482,7 +482,14 @@ class KvCacheCreator:
|
||||
)
|
||||
# KVCacheManager (Non-draft) modifies the max_seq_len field, update it to self
|
||||
if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER:
|
||||
self._max_seq_len = kv_cache_manager.max_seq_len
|
||||
# When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
|
||||
if kv_cache_manager is not None:
|
||||
if kv_cache_manager.max_seq_len < self._max_seq_len:
|
||||
self._dummy_reqs = self._create_dummy_context_requests(
|
||||
max(
|
||||
1, self._net_max_seq_len - 1 -
|
||||
(self._max_seq_len - kv_cache_manager.max_seq_len)))
|
||||
self._max_seq_len = kv_cache_manager.max_seq_len
|
||||
|
||||
# When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
|
||||
if kv_cache_manager is not None:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user