mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[fix] Fix illegal mem access and possible accuracy loss (#4943)
Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
parent 20425deb3b
commit ff4212377c
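The change threads a new kvFactor argument through kvCacheBlockPartialCopy instead of hardcoding the K/V multiplicity: the kernel launch previously scaled the layer dimension as 2 * numLayers (one K plus one V sub-tensor per layer), which overruns the block for cache layouts that store a single sub-tensor per layer (kvFactor == 1, as in K-only caches), consistent with the illegal memory access and accuracy loss named in the title. The test hunks at the end re-enable DeepSeek-V3-Lite accuracy cases that had been skipped or waived. A minimal sketch of the size arithmetic, assuming an illustrative pooled-block layout (names are mine, not the repo's):

// Illustrative only: how the element extent of one cache block scales with
// kvFactor. kvFactor == 2 covers K and V; kvFactor == 1 covers single-tensor
// layouts, where a hardcoded factor of 2 walks past the block's end.
#include <cstddef>

std::size_t blockExtentElems(std::size_t numLayers, std::size_t numHeads,
    std::size_t tokensPerBlock, std::size_t numHidden, int kvFactor)
{
    return numLayers * static_cast<std::size_t>(kvFactor) * numHeads * tokensPerBlock * numHidden;
}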
@@ -23,6 +23,7 @@ namespace tensorrt_llm
 namespace kernels
 {
 void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream);
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream);
 } // namespace kernels
 } // namespace tensorrt_llm
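Annotated form of the new signature from this hunk; the per-parameter comments are my reading of the diff rather than documentation from the repo, and IBuffer/cudaStream_t are stubbed so the snippet compiles stand-alone:

struct IBuffer;             // stand-in for the runtime buffer type
using cudaStream_t = void*; // stand-in for CUDA's stream handle

void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src,
    unsigned int numLayers,       // layers pooled in one cache block
    unsigned int numHeads,        // KV heads per layer
    unsigned int tokensPerBlock,  // block capacity in tokens
    unsigned int numHidden,       // elements per head
    unsigned int numTokensToCopy, // leading tokens to copy, <= tokensPerBlock
    int kvFactor,                 // sub-tensors per layer: 2 for K+V, 1 for K-only caches
    cudaStream_t stream);         // stream the copy kernel is launched on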
@@ -90,8 +90,8 @@ void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
         "Block shape is incorrect");
     TLLM_CHECK_WITH_INFO(numTokensToCopy <= tokensPerBlock,
         "numTokensToCopy (%d) must be <= tokensPerBlock (%d)", numTokensToCopy, tokensPerBlock);
-    tk::kvCacheBlockPartialCopy(
-        *dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead, numTokensToCopy, stream);
+    tk::kvCacheBlockPartialCopy(*dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead,
+        numTokensToCopy, kvFactor, stream);
 }
 }
 }
@@ -58,7 +58,8 @@ unsigned int ipow2(unsigned int v)
 
 template <typename T>
 void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream)
 {
     unsigned int blockX = ipow2(numHidden); // ensure block shape is a power of 2
     blockX = std::min(blockX, 32u); // blockX should not exceed warp size
@@ -75,12 +76,13 @@ void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int
     auto srcData = bufferCast<T>(src);
     auto dstData = bufferCast<T>(dst);
     cuKVCacheBlockPartialCopy<<<grid, block, 0, stream>>>(
-        dstData, srcData, 2 * numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
+        dstData, srcData, numLayers * kvFactor, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
 }
 } // namespace
 
 void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream)
 {
     auto dataType = src.getDataType();
     TLLM_CHECK_WITH_INFO(dataType == dst.getDataType(), "src and dst dataType does not match");
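The substantive kernel-side fix is the launch argument above: 2 * numLayers becomes numLayers * kvFactor. A host-side reference loop, under an assumed [layer * kvFactor][head][token][hidden] block layout (the kernel's actual layout is not visible in this diff), makes the copy bound explicit:

#include <algorithm>
#include <cstddef>

template <typename T>
void partialCopyRef(T* dst, T const* src, unsigned numLayers, unsigned numHeads,
    unsigned tokensPerBlock, unsigned numHidden, unsigned numTokensToCopy, int kvFactor)
{
    // Copy the first numTokensToCopy tokens of every (layer, K/V) sub-tensor.
    // With kvFactor == 1, iterating 2 * numLayers here would run numLayers
    // sub-tensors past the end of the block.
    for (unsigned l = 0; l < numLayers * static_cast<unsigned>(kvFactor); ++l)
        for (unsigned h = 0; h < numHeads; ++h)
            for (unsigned t = 0; t < numTokensToCopy; ++t)
            {
                std::size_t base = ((std::size_t{l} * numHeads + h) * tokensPerBlock + t) * numHidden;
                std::copy(src + base, src + base + numHidden, dst + base);
            }
}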
@@ -88,42 +90,42 @@ void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numL
     {
     case nvinfer1::DataType::kINT64:
         hostKVCacheBlockPartialCopy<SizeType64>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kINT32:
         hostKVCacheBlockPartialCopy<std::int32_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kFLOAT:
         hostKVCacheBlockPartialCopy<float>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #ifdef ENABLE_BF16
     case nvinfer1::DataType::kBF16:
         hostKVCacheBlockPartialCopy<__nv_bfloat16>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #endif
     case nvinfer1::DataType::kHALF:
         hostKVCacheBlockPartialCopy<half>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kBOOL:
         hostKVCacheBlockPartialCopy<bool>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kUINT8:
         hostKVCacheBlockPartialCopy<std::uint8_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kINT8:
         hostKVCacheBlockPartialCopy<std::int8_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #ifdef ENABLE_FP8
     case nvinfer1::DataType::kFP8:
         hostKVCacheBlockPartialCopy<__nv_fp8_e4m3>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #endif
     default: TLLM_THROW("Unknown data type");
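The dispatch switch forwards kvFactor unchanged into every per-dtype instantiation. A tiny self-contained check of the intended semantics (hypothetical, CPU-only, float for simplicity): with kvFactor = 1 every index stays inside the block, while doubling the layer loop, as the pre-fix launch effectively did, would trip the bounds assertion:

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    unsigned const numLayers = 2, numHeads = 1, tokensPerBlock = 4, numHidden = 3;
    unsigned const numTokensToCopy = 2;
    int const kvFactor = 1; // single sub-tensor per layer
    std::size_t const blockElems
        = std::size_t{numLayers} * kvFactor * numHeads * tokensPerBlock * numHidden;
    std::vector<float> src(blockElems, 1.f), dst(blockElems, 0.f);

    for (unsigned l = 0; l < numLayers * static_cast<unsigned>(kvFactor); ++l) // pre-fix: 2 * numLayers
        for (unsigned h = 0; h < numHeads; ++h)
            for (unsigned t = 0; t < numTokensToCopy; ++t)
                for (unsigned e = 0; e < numHidden; ++e)
                {
                    std::size_t const i
                        = ((std::size_t{l} * numHeads + h) * tokensPerBlock + t) * numHidden + e;
                    assert(i < blockElems); // fails if the layer loop is doubled
                    dst[i] = src[i];
                }

    assert(dst.front() == 1.f); // first token copied
    assert(dst.back() == 0.f);  // tokens beyond numTokensToCopy untouched
    return 0;
}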
@@ -443,8 +443,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                           [0, pytest.param(2, marks=skip_pre_hopper)])
     def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                       overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -485,8 +483,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
                             torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -529,8 +525,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
                               overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -609,8 +603,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -407,7 +407,6 @@ accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://n
 accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5231468)
 accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image] SKIP (https://nvbugs/5233423)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5294983)
 examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
@@ -436,17 +435,6 @@ triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-deco
 triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP
 triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP
 triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble] SKIP
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5285965)
 examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] SKIP (https://nvbugs/5289523)
 examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (https://nvbugs/5289523)
 examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5289904)