From e010548b9d6790cf097b1df8d6e8e91e75364d36 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 5 Jun 2026 11:19:23 -0400 Subject: [PATCH] Remove unused is_kv_layout_blocks_first from TransferTopology Its only consumer was a diagnostic field in an AssertionError message. Drop the property, its backing field, and the error-message field; also remove the now-stale blocks-first and packed-K/V comments and reword the shape-probe comment to describe its remaining purpose (detecting cross-layer block layouts). Co-authored-by: Claude Opus 4.8 (1M context) Signed-off-by: Lucas Wilkinson --- .../kv_transfer/kv_connector/utils.py | 18 ++---------------- .../kv_transfer/kv_connector/v1/nixl/worker.py | 4 +--- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 5dccc367da3..1c7a8c9d830 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -405,8 +405,8 @@ class TransferTopology: self._engines: dict[EngineId, EngineTransferInfo] = {} - # Figure out whether the first dimension of the cache is K/V - # or num_blocks. + # Probe the per-layer cache shape (num_blocks mocked to 1) so we can + # detect cross-layer block layouts below. attn_backend = self.attn_backends[0] if not self.is_mamba: _MOCK_BLOCK_SIZE = 16 @@ -417,14 +417,6 @@ class TransferTopology: head_size=1, ) logger.debug("Test kv_cache_shape: %s", kv_cache_shape) - # In the standardized layout K and V are packed into the content dim, - # so attention caches are 4D [num_blocks, num_kv_heads, block_size, - # 2*head_size] with num_blocks leading (blocks-first). We mock - # num_blocks to 1 for the dimension check below. Hybrid SSM models also - # assume a blocks-first layout. 
- self._is_kv_layout_blocks_first = self.is_mamba or ( - len(kv_cache_shape) == 4 and kv_cache_shape[0] == 1 - ) self._cross_layers_blocks = False if self.tensor_shape is not None: @@ -475,10 +467,6 @@ class TransferTopology: # Layout properties # ============================================================ - @property - def is_kv_layout_blocks_first(self) -> bool: - return self._is_kv_layout_blocks_first - @property def cross_layers_blocks(self) -> bool: return self._cross_layers_blocks @@ -586,8 +574,6 @@ class TransferTopology: # Swap [2<>num_blocks] dims for hybrid SSM layout. cache = cache.transpose(0, 1) - # K and V are packed into one tensor (content dim), so each layer - # registers as a single region. return [cache] def describe(self, remote_engine_id: EngineId) -> str: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py index e8c52202a57..f479d5c127c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py @@ -917,9 +917,7 @@ class NixlConnectorWorker: f"backend={self.backend_name}, " "all_backends=" f"{[backend.get_name() for backend in self.attn_backends]}, " - f"kv_cache_layout={self.kv_cache_layout}, " - "blocks_first=" - f"{self.transfer_topo.is_kv_layout_blocks_first}" + f"kv_cache_layout={self.kv_cache_layout}" ) if not self.use_mla: