[None][fix] Fix comments for kv cache manager v2 (#11207)

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Yi Zhang 2026-02-05 12:31:29 +08:00 committed by GitHub
parent 4adf76d860
commit ada463d15d

@@ -23,7 +23,8 @@ from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.lora_manager import LoraManager, LoraModelConfig
 from tensorrt_llm.math_utils import ceil_div
 from tensorrt_llm.runtime import ModelConfig as ModelConfigPython
-from tensorrt_llm.runtime.kv_cache_manager_v2 import (AttentionLayerConfig,
+from tensorrt_llm.runtime.kv_cache_manager_v2 import (DEFAULT_BEAM_INDEX,
+                                                      AttentionLayerConfig,
                                                       BufferConfig,
                                                       GpuCacheTierConfig,
                                                       HostCacheTierConfig)
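
Note on the import: DEFAULT_BEAM_INDEX replaces the hard-coded beam index 0 at the call sites below. A minimal sketch of the idea, assuming the constant is simply defined as 0 in kv_cache_manager_v2 (the definition itself is not part of this diff):

# Assumed definition; KVCacheManagerV2 asserts beam_width == 1, so index 0
# always addresses the single active beam.
DEFAULT_BEAM_INDEX = 0

def prompt_tokens(req):
    # Hypothetical helper: the named constant documents that the call site
    # relies on the beam-width-1 assumption, unlike a bare 0.
    return req.get_tokens(DEFAULT_BEAM_INDEX)
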
@@ -1704,7 +1705,7 @@ class KVCacheManagerV2(BaseResourceManager):
             # Last token cannot be recovered, so we don't include it in the input tokens to look up for the block that can be reused.
             kv_cache = self._create_kv_cache(
                 req.py_request_id, req.lora_task_id,
-                req.get_tokens(0)[:-1]
+                req.get_tokens(DEFAULT_BEAM_INDEX)[:-1]
                 if self.enable_block_reuse else None)
             assert beam_width == 1, "Currently, KVCacheManagerV2 only supports beam width 1"
             if not self.enable_block_reuse:
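
Context for the [:-1] slice: the last prompt token's KV entries must be recomputed anyway to produce the next-token logits, so a cached block covering it can never be reused and is excluded from the lookup. A hedged sketch of that prefix lookup (lookup_reusable_prefix and the dict-based block index are illustrative, not the real API):

# Illustrative prefix lookup over full cached blocks.
def lookup_reusable_prefix(block_index, tokens, block_size):
    lookup = tokens[:-1]  # last token is never recoverable from cache
    matched = 0
    for start in range(0, len(lookup) - block_size + 1, block_size):
        key = tuple(lookup[start:start + block_size])
        if key not in block_index:  # stop at the first uncached block
            break
        matched += block_size
    return matched

index = {(1, 2, 3, 4): 0}  # one cached block of size 4
assert lookup_reusable_prefix(index, [1, 2, 3, 4, 5], 4) == 4
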
@@ -1771,9 +1772,7 @@ class KVCacheManagerV2(BaseResourceManager):
             max_num_draft_tokens: int = 0,
             use_mrope: bool = False,
             max_beam_width: int = 1,
-            num_extra_decoding_steps:
-            int = 0,  # TODO: support num_extra_decoding_steps
-    ):
+            num_extra_decoding_steps: int = 0):
         beam_width = max_beam_width
         requests = []
@@ -1848,7 +1847,8 @@ class KVCacheManagerV2(BaseResourceManager):
         return self._get_batch_cache_indices_by_pool_id(
             request_ids,
-            pool_id=self.layer_to_pool_mapping_dict[layer_id],
+            pool_id=self.layer_to_pool_mapping_dict[
+                self.layer_offsets[layer_id]],
             is_kv_aggregate=True)

     def _get_batch_cache_indices_by_pool_id(
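
The functional fix in this hunk: layer_to_pool_mapping_dict is keyed by the manager's local layer offset, so a global layer_id must first be translated through layer_offsets. A toy illustration of the double indirection (all values invented):

# Toy values: global layer ids 4..6 are owned locally as offsets 0..2.
layer_offsets = {4: 0, 5: 1, 6: 2}
layer_to_pool_mapping_dict = {0: 0, 1: 0, 2: 1}  # local offset -> pool id

def pool_for_layer(layer_id):
    # Indexing the pool map with the raw global id (the old code) would
    # pick the wrong pool, or raise KeyError once offsets diverge from ids.
    return layer_to_pool_mapping_dict[layer_offsets[layer_id]]

assert pool_for_layer(6) == 1
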
@@ -2040,8 +2040,9 @@ class KVCacheManagerV2(BaseResourceManager):
             if self.enable_block_reuse and not req.is_dummy_request:
                 if req.context_current_position > kv_cache.num_committed_tokens:
                     kv_cache.commit(
-                        req.get_tokens(0)[kv_cache.num_committed_tokens:req.
-                                          context_current_position])
+                        req.get_tokens(DEFAULT_BEAM_INDEX)
+                        [kv_cache.num_committed_tokens:req.
+                         context_current_position])
                 kv_cache.stop_committing()
             else:
                 kv_cache.resize(None, req.context_current_position)
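
For reference, the slice passed to commit() covers only the tokens processed since the last commit, so repeated calls never re-commit a prefix. A self-contained model of that bookkeeping (ToyKVCache is a stand-in, not the real class):

class ToyKVCache:
    def __init__(self):
        self.num_committed_tokens = 0
        self.committed = []

    def commit(self, new_tokens):
        # Callers pass tokens[num_committed_tokens:current_position],
        # so every token is committed exactly once.
        self.committed.extend(new_tokens)
        self.num_committed_tokens += len(new_tokens)

tokens = [11, 12, 13, 14, 15]
cache = ToyKVCache()
cache.commit(tokens[cache.num_committed_tokens:3])  # first chunk: 11, 12, 13
cache.commit(tokens[cache.num_committed_tokens:5])  # only 14, 15 are new
assert cache.committed == tokens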