[fix] Fix illegal mem access and possible accuracy loss (#4943)

Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
This commit is contained in:
liji-nv 2025-06-08 11:19:42 +08:00 committed by GitHub
parent 20425deb3b
commit ff4212377c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 18 additions and 35 deletions

View File

@ -23,6 +23,7 @@ namespace tensorrt_llm
namespace kernels
{
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream);
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm

View File

@ -90,8 +90,8 @@ void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
"Block shape is incorrect");
TLLM_CHECK_WITH_INFO(numTokensToCopy <= tokensPerBlock,
"numTokensToCopy (%d) must be <= tokensPerBlock (%d)", numTokensToCopy, tokensPerBlock);
tk::kvCacheBlockPartialCopy(
*dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead, numTokensToCopy, stream);
tk::kvCacheBlockPartialCopy(*dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead,
numTokensToCopy, kvFactor, stream);
}
}
}

View File

@ -58,7 +58,8 @@ unsigned int ipow2(unsigned int v)
template <typename T>
void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
cudaStream_t stream)
{
unsigned int blockX = ipow2(numHidden); // ensure block shape is a power of 2
blockX = std::min(blockX, 32u); // blockX should not exceed warp size
@ -75,12 +76,13 @@ void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int
auto srcData = bufferCast<T>(src);
auto dstData = bufferCast<T>(dst);
cuKVCacheBlockPartialCopy<<<grid, block, 0, stream>>>(
dstData, srcData, 2 * numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
dstData, srcData, numLayers * kvFactor, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
}
} // namespace
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
cudaStream_t stream)
{
auto dataType = src.getDataType();
TLLM_CHECK_WITH_INFO(dataType == dst.getDataType(), "src and dst dataType does not match");
@ -88,42 +90,42 @@ void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numL
{
case nvinfer1::DataType::kINT64:
hostKVCacheBlockPartialCopy<SizeType64>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
case nvinfer1::DataType::kINT32:
hostKVCacheBlockPartialCopy<std::int32_t>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
case nvinfer1::DataType::kFLOAT:
hostKVCacheBlockPartialCopy<float>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
#ifdef ENABLE_BF16
case nvinfer1::DataType::kBF16:
hostKVCacheBlockPartialCopy<__nv_bfloat16>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
#endif
case nvinfer1::DataType::kHALF:
hostKVCacheBlockPartialCopy<half>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
case nvinfer1::DataType::kBOOL:
hostKVCacheBlockPartialCopy<bool>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
case nvinfer1::DataType::kUINT8:
hostKVCacheBlockPartialCopy<std::uint8_t>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
case nvinfer1::DataType::kINT8:
hostKVCacheBlockPartialCopy<std::int8_t>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
#ifdef ENABLE_FP8
case nvinfer1::DataType::kFP8:
hostKVCacheBlockPartialCopy<__nv_fp8_e4m3>(
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
break;
#endif
default: TLLM_THROW("Unknown data type");

View File

@ -443,8 +443,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
[0, pytest.param(2, marks=skip_pre_hopper)])
def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
overlap_scheduler, torch_compile):
if torch_compile:
pytest.skip("https://nvbugs/5292037")
if torch_compile and mtp_nextn > 0:
pytest.skip("https://nvbugs/5252313")
if torch_compile and attention_dp:
@ -485,8 +483,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
attention_dp, cuda_graph, overlap_scheduler,
torch_compile):
if torch_compile:
pytest.skip("https://nvbugs/5292037")
if torch_compile and mtp_nextn > 0:
pytest.skip("https://nvbugs/5252313")
if torch_compile and attention_dp:
@ -529,8 +525,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
@parametrize_with_ids("mtp_nextn", [0, 2])
def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
overlap_scheduler, torch_compile):
if torch_compile:
pytest.skip("https://nvbugs/5292037")
if torch_compile and mtp_nextn > 0:
pytest.skip("https://nvbugs/5252313")
if torch_compile and attention_dp:
@ -609,8 +603,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
fp8kv, attention_dp, cuda_graph,
overlap_scheduler, torch_compile):
if torch_compile:
pytest.skip("https://nvbugs/5292037")
if torch_compile and mtp_nextn > 0:
pytest.skip("https://nvbugs/5252313")
if torch_compile and attention_dp:

View File

@ -407,7 +407,6 @@ accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://n
accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5231468)
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310)
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image] SKIP (https://nvbugs/5233423)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5294983)
examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
@ -436,17 +435,6 @@ triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-deco
triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP
triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP
triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble] SKIP
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5285965)
examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] SKIP (https://nvbugs/5289523)
examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (https://nvbugs/5289523)
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5289904)