From 356a52edf56e2e40ff588fdfb63d61604608e248 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com>
Date: Mon, 1 Dec 2025 19:14:30 -0800
Subject: [PATCH] [None][feat] Add support for KVCache reuse for DSv32 (#9383)

Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
---
 .../tensorrt_llm/batch_manager/kvCacheUtils.h       |  4 ++++
 cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp  |  2 +-
 cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp   |  9 +--------
 examples/models/core/deepseek_v3/README.md          |  9 ---------
 .../_torch/attention_backend/sparse/dsa.py          | 12 ++++--------
 .../defs/accuracy/test_disaggregated_serving.py     |  2 --
 .../defs/accuracy/test_llm_api_pytorch.py           | 14 ++++----------
 7 files changed, 14 insertions(+), 38 deletions(-)

diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
index 77c2fcc497..90416d124c 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
@@ -183,6 +183,10 @@ private:
             auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
             mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
         }
+        if (cacheManager.isEnableIndexerKCache())
+        {
+            mIndexerKCachePool = cacheManager.getIndexerKCachePool();
+        }
     }

     BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId)
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp
index 5c61a182bd..07c1b83dbc 100644
--- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp
+++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp
@@ -806,7 +806,7 @@ public:

         RequestInfo requestInfo(requestId, mSelfState);

-        if (mFormatter->getCacheManager()->getBlockManager().getNumPools() == 1)
+        if (!mFormatter->getCacheManager()->getBlockManager().isVariableWindow())
         {
             auto* cacheManager = mFormatter->getCacheManager();
             auto beam = 0;
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
index b69db6d1bc..7206ebd8e6 100644
--- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
+++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -876,14 +876,7 @@ void WindowBlockManager::allocatePools(bool useUvm)
         }

         nvinfer1::Dims cacheShape;
-        if (pool.containsIndexerKCache)
-        {
-            cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, blockSize});
-        }
-        else
-        {
-            cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});
-        }
+        cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});

         TLLM_LOG_DEBUG("[%s] Allocating primary pool with %d blocks for %d layers with %d kv heads", mLogPrefix.c_str(),
             mNumPrimaryBlocks, pool.numLayers, pool.numKvHeads);
diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md
index abef0d9b3b..3e82442563 100644
--- a/examples/models/core/deepseek_v3/README.md
+++ b/examples/models/core/deepseek_v3/README.md
@@ -881,12 +881,3 @@ python quickstart_advanced.py --model_dir --enable_chunked_pref
 - **GPU Memory:** Adjust `--max_batch_size` and `--max_num_tokens` if you encounter out-of-memory errors.
 - **Logs:** Check `/workspace/trt_bench.log` for detailed performance information and troubleshooting messages.
 - **Configuration Files:** Verify that the configuration files are correctly formatted to avoid runtime issues.
-
-## Known Issues
-- Support for KV Cache Reuse and Chunked Prefill in DeepSeek-V3.2-Exp is currently under development. When running `quickstart_advanced.py`, please include `--disable_kv_cache_reuse` to disable KV Cache Reuse. When using `trtllm-eval`/`trtllm-serve`/`trtllm-bench`, please include the following configuration in the extra llm_api options:
-```
-kv_cache_config:
-  enable_block_reuse: false
-  tokens_per_block: 64
-enable_chunked_prefill: false
-```
diff --git a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py
index ccdd1e25fd..a46752745a 100644
--- a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py
+++ b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py
@@ -930,7 +930,8 @@ class Indexer(nn.Module):
                 start_idx=0,
             )

-            if len(chunk_groups) > 1:
+            if len(chunk_groups
+                   ) > 1 or metadata.enable_context_mla_with_cached_kv:
                 metadata.indexer_prefill_chunks = [
                     Indexer.prepare_one_prefill_chunk(
                         metadata,
@@ -938,7 +939,6 @@ class Indexer(nn.Module):
                     ) for chunk_specs in chunk_groups
                 ]
             else:
-                # Single chunk - use non-chunked fallback path
                 metadata.indexer_prefill_chunks = None

         host_cu_seqlen_ks, host_cu_seqlen_ke = compute_cu_seqlen_kv_bounds_with_cache(
@@ -1018,9 +1018,9 @@ class Indexer(nn.Module):
             metadata.slot_mapping_scale[:total_tokens].copy_(
                 metadata.host_slot_mapping_scale[:total_tokens],
                 non_blocking=True)
-        # Only when MLA chunked prefill is enabled, we need to gather the full KV for indexer's logit computation.
+        # When chunked prefill or KVCache reuse is enabled, we need to gather the full KV for indexer's logit computation.
         # Indexer's own chunking does not need full KV gathering, instead it gathers only the current chunk with loop-based gathering.
-        _need_full_kv_gathering = num_contexts > 0 and has_mla_chunked_prefill
+        _need_full_kv_gathering = num_contexts > 0 and metadata.enable_context_mla_with_cached_kv
         if _need_full_kv_gathering:
             total_kv_len = metadata.host_ctx_kv_indptr[num_contexts].item()
             total_kv_per_request = seq_lens[:
@@ -1589,10 +1589,6 @@ class DSACacheManager(KVCacheManager):
         sparse_attn_config: "SparseAttentionConfig",
         **kwargs,
     ) -> None:
-
-        if kv_cache_config.enable_block_reuse:
-            raise NotImplementedError(
-                "DSA indexer K-cache manager does not support block reuse yet")
         self.quant_block_size = 128
         self.index_head_dim = sparse_attn_config.index_head_dim
         # Use a fixed tokens_per_block for indexer k cache due to DG kernel constraints
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 2972e0ad4b..57e14f18c5 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -1055,7 +1055,6 @@ class TestDeepSeekV32Exp(LlmapiAccuracyTestHarness):
         ctx_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}
         gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}
         ctx_server_config["kv_cache_config"] = {
-            "enable_block_reuse": False,
             "free_gpu_memory_fraction": 0.7,
             "tokens_per_block": 64,
             "dtype": "fp8"
@@ -1072,7 +1071,6 @@ class TestDeepSeekV32Exp(LlmapiAccuracyTestHarness):
         ctx_server_config["enable_attention_dp"] = True
         ctx_server_config["enable_autotuner"] = False
         gen_server_config["kv_cache_config"] = {
-            "enable_block_reuse": False,
             "tokens_per_block": 64,
             "free_gpu_memory_fraction": 0.7,
             "dtype": "fp8"
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 75d636f130..ed04151db2 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2597,17 +2597,13 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness):
         if get_sm_version() == 100 or get_sm_version() == 103:
             moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend
             moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-            # TODO: Support block reuse for DeepSeek-V3.2
-            kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                            free_gpu_memory_fraction=0.6,
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                             tokens_per_block=64)
         else:
             if moe_backend != "_DEFAULT":
                 pytest.skip("Not supported MoE backend!")
             moe_config = MoeConfig()
-            # TODO: Support block reuse for DeepSeek-V3.2
-            kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                            free_gpu_memory_fraction=0.7,
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                             tokens_per_block=64)

         pytorch_config = dict(
@@ -2670,8 +2666,7 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness):
                 "MOE TRTLLM backend does not support SM version 120 or 121")

         moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.7,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         tokens_per_block=64)
         cuda_graph_config = CudaGraphConfig(
             enable_padding=True,
@@ -2730,8 +2725,7 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness):
                 "MOE TRTLLM backend does not support SM version 120 or 121")

         moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.7,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         tokens_per_block=64)
         cuda_graph_config = CudaGraphConfig(
             enable_padding=True,