[None][fix] Fix comments for kv cache manager v2 (#11207)

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Yi Zhang 2026-02-05 12:31:29 +08:00 committed by GitHub
parent 4adf76d860
commit ada463d15d

@@ -23,7 +23,8 @@ from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.lora_manager import LoraManager, LoraModelConfig
 from tensorrt_llm.math_utils import ceil_div
 from tensorrt_llm.runtime import ModelConfig as ModelConfigPython
-from tensorrt_llm.runtime.kv_cache_manager_v2 import (AttentionLayerConfig,
+from tensorrt_llm.runtime.kv_cache_manager_v2 import (DEFAULT_BEAM_INDEX,
+                                                      AttentionLayerConfig,
                                                       BufferConfig,
                                                       GpuCacheTierConfig,
                                                       HostCacheTierConfig)
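
Note on the import: DEFAULT_BEAM_INDEX replaces the hard-coded beam index 0 at the call sites below. A minimal sketch of the idea, assuming the constant is simply defined as 0 in kv_cache_manager_v2 (the definition itself is not part of this diff):

# Assumed definition; KVCacheManagerV2 asserts beam_width == 1, so index 0
# always addresses the single active beam.
DEFAULT_BEAM_INDEX = 0

def prompt_tokens(req):
    # Hypothetical helper: the named constant documents that the call site
    # relies on the beam-width-1 assumption, unlike a bare 0.
    return req.get_tokens(DEFAULT_BEAM_INDEX)
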
@@ -1704,7 +1705,7 @@ class KVCacheManagerV2(BaseResourceManager):
             # Last token cannot be recovered, so we don't include it in the input tokens to look up for the block that can be reused.
             kv_cache = self._create_kv_cache(
                 req.py_request_id, req.lora_task_id,
-                req.get_tokens(0)[:-1]
+                req.get_tokens(DEFAULT_BEAM_INDEX)[:-1]
                 if self.enable_block_reuse else None)
             assert beam_width == 1, "Currently, KVCacheManagerV2 only supports beam width 1"
             if not self.enable_block_reuse:
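
Context for the [:-1] slice: the last prompt token's KV entries must be recomputed anyway to produce the next-token logits, so a cached block covering it can never be reused and is excluded from the lookup. A hedged sketch of that prefix lookup (lookup_reusable_prefix and the dict-based block index are illustrative, not the real API):

# Illustrative prefix lookup over full cached blocks.
def lookup_reusable_prefix(block_index, tokens, block_size):
    lookup = tokens[:-1]  # last token is never recoverable from cache
    matched = 0
    for start in range(0, len(lookup) - block_size + 1, block_size):
        key = tuple(lookup[start:start + block_size])
        if key not in block_index:  # stop at the first uncached block
            break
        matched += block_size
    return matched

index = {(1, 2, 3, 4): 0}  # one cached block of size 4
assert lookup_reusable_prefix(index, [1, 2, 3, 4, 5], 4) == 4
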
@@ -1771,9 +1772,7 @@ class KVCacheManagerV2(BaseResourceManager):
             max_num_draft_tokens: int = 0,
             use_mrope: bool = False,
             max_beam_width: int = 1,
-            num_extra_decoding_steps:
-            int = 0,  # TODO: support num_extra_decoding_steps
-    ):
+            num_extra_decoding_steps: int = 0):
         beam_width = max_beam_width
         requests = []
@@ -1848,7 +1847,8 @@ class KVCacheManagerV2(BaseResourceManager):
         return self._get_batch_cache_indices_by_pool_id(
             request_ids,
-            pool_id=self.layer_to_pool_mapping_dict[layer_id],
+            pool_id=self.layer_to_pool_mapping_dict[
+                self.layer_offsets[layer_id]],
             is_kv_aggregate=True)

     def _get_batch_cache_indices_by_pool_id(
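
The functional fix in this hunk: layer_to_pool_mapping_dict is keyed by the manager's local layer offset, so a global layer_id must first be translated through layer_offsets. A toy illustration of the double indirection (all values invented):

# Toy values: global layer ids 4..6 are owned locally as offsets 0..2.
layer_offsets = {4: 0, 5: 1, 6: 2}
layer_to_pool_mapping_dict = {0: 0, 1: 0, 2: 1}  # local offset -> pool id

def pool_for_layer(layer_id):
    # Indexing the pool map with the raw global id (the old code) would
    # pick the wrong pool, or raise KeyError once offsets diverge from ids.
    return layer_to_pool_mapping_dict[layer_offsets[layer_id]]

assert pool_for_layer(6) == 1
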
@@ -2040,8 +2040,9 @@ class KVCacheManagerV2(BaseResourceManager):
             if self.enable_block_reuse and not req.is_dummy_request:
                 if req.context_current_position > kv_cache.num_committed_tokens:
                     kv_cache.commit(
-                        req.get_tokens(0)[kv_cache.num_committed_tokens:req.
-                                          context_current_position])
+                        req.get_tokens(DEFAULT_BEAM_INDEX)
+                        [kv_cache.num_committed_tokens:req.
+                         context_current_position])
                 kv_cache.stop_committing()
             else:
                 kv_cache.resize(None, req.context_current_position)
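
For reference, the slice passed to commit() covers only the tokens processed since the last commit, so repeated calls never re-commit a prefix. A self-contained model of that bookkeeping (ToyKVCache is a stand-in, not the real class):

class ToyKVCache:
    def __init__(self):
        self.num_committed_tokens = 0
        self.committed = []

    def commit(self, new_tokens):
        # Callers pass tokens[num_committed_tokens:current_position],
        # so every token is committed exactly once.
        self.committed.extend(new_tokens)
        self.num_committed_tokens += len(new_tokens)

tokens = [11, 12, 13, 14, 15]
cache = ToyKVCache()
cache.commit(tokens[cache.num_committed_tokens:3])  # first chunk: 11, 12, 13
cache.commit(tokens[cache.num_committed_tokens:5])  # only 14, 15 are new
assert cache.committed == tokens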