mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-16 15:55:08 +08:00
[None][fix] Fix comments for kv cache manager v2 (#11207)
Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
This commit is contained in:
parent
4adf76d860
commit
ada463d15d
@ -23,7 +23,8 @@ from tensorrt_llm.lora_helper import LoraConfig
|
||||
from tensorrt_llm.lora_manager import LoraManager, LoraModelConfig
|
||||
from tensorrt_llm.math_utils import ceil_div
|
||||
from tensorrt_llm.runtime import ModelConfig as ModelConfigPython
|
||||
from tensorrt_llm.runtime.kv_cache_manager_v2 import (AttentionLayerConfig,
|
||||
from tensorrt_llm.runtime.kv_cache_manager_v2 import (DEFAULT_BEAM_INDEX,
|
||||
AttentionLayerConfig,
|
||||
BufferConfig,
|
||||
GpuCacheTierConfig,
|
||||
HostCacheTierConfig)
|
||||
@ -1704,7 +1705,7 @@ class KVCacheManagerV2(BaseResourceManager):
|
||||
# Last token cannot be recovered, so we don't include it in the input tokens to look up for the block that can be reused.
|
||||
kv_cache = self._create_kv_cache(
|
||||
req.py_request_id, req.lora_task_id,
|
||||
req.get_tokens(0)[:-1]
|
||||
req.get_tokens(DEFAULT_BEAM_INDEX)[:-1]
|
||||
if self.enable_block_reuse else None)
|
||||
assert beam_width == 1, "Currently, KVCacheManagerV2 only supports beam width 1"
|
||||
if not self.enable_block_reuse:
|
||||
@ -1771,9 +1772,7 @@ class KVCacheManagerV2(BaseResourceManager):
|
||||
max_num_draft_tokens: int = 0,
|
||||
use_mrope: bool = False,
|
||||
max_beam_width: int = 1,
|
||||
num_extra_decoding_steps:
|
||||
int = 0, # TODO: support num_extra_decoding_steps
|
||||
):
|
||||
num_extra_decoding_steps: int = 0):
|
||||
|
||||
beam_width = max_beam_width
|
||||
requests = []
|
||||
@ -1848,7 +1847,8 @@ class KVCacheManagerV2(BaseResourceManager):
|
||||
|
||||
return self._get_batch_cache_indices_by_pool_id(
|
||||
request_ids,
|
||||
pool_id=self.layer_to_pool_mapping_dict[layer_id],
|
||||
pool_id=self.layer_to_pool_mapping_dict[
|
||||
self.layer_offsets[layer_id]],
|
||||
is_kv_aggregate=True)
|
||||
|
||||
def _get_batch_cache_indices_by_pool_id(
|
||||
@ -2040,8 +2040,9 @@ class KVCacheManagerV2(BaseResourceManager):
|
||||
if self.enable_block_reuse and not req.is_dummy_request:
|
||||
if req.context_current_position > kv_cache.num_committed_tokens:
|
||||
kv_cache.commit(
|
||||
req.get_tokens(0)[kv_cache.num_committed_tokens:req.
|
||||
context_current_position])
|
||||
req.get_tokens(DEFAULT_BEAM_INDEX)
|
||||
[kv_cache.num_committed_tokens:req.
|
||||
context_current_position])
|
||||
kv_cache.stop_committing()
|
||||
else:
|
||||
kv_cache.resize(None, req.context_current_position)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user