[Attention][CPU] Standardize kv layout to blocks first (#44393)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
2026-06-06 00:16:14 +00:00 · 2026-06-03 19:03:09 +08:00
parent 95b1615ec9
commit 823d271c0d
2 changed files with 14 additions and 7 deletions
@@ -258,10 +258,13 @@ def varlen_with_paged_kv(

    # KV cache for CPU attention
    cache_dtype = torch.uint8 if is_fp8 else dtype
-    packed_key_cache = torch.empty(
-        num_blocks, num_kv_heads, block_size, head_size, dtype=cache_dtype
+    packed_key_value_cache = torch.empty(
+        num_blocks, num_kv_heads, block_size, head_size * 2, dtype=cache_dtype
    )
-    packed_value_cache = torch.empty_like(packed_key_cache)
+    packed_key_value_cache = packed_key_value_cache.view(
+        (num_blocks, num_kv_heads, block_size * 2, -1)
+    )
+    packed_key_cache, packed_value_cache = packed_key_value_cache.chunk(2, dim=2)

    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
        dim=0, dtype=torch.int32