mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[fix] Fix illegal mem access and possible accuracy loss (#4943)
Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
This commit is contained in:
parent 20425deb3b
commit ff4212377c
@@ -23,6 +23,7 @@ namespace tensorrt_llm
 namespace kernels
 {
 void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream);
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream);
 } // namespace kernels
 } // namespace tensorrt_llm
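The declaration above threads a new kvFactor argument through the partial-copy API. Judging from the later hunks, kvFactor is the number of cache tensors each layer keeps per block (2 when keys and values are stored separately, 1 when a layer keeps a single combined buffer), so a block's element count follows the sketch below. The helper name and this interpretation are assumptions, not part of the diff:

#include <cstddef>

// Hypothetical helper: how many elements one KV-cache block holds, given the
// parameters of the new kvCacheBlockPartialCopy signature. With the old
// hard-coded layout this was always 2 * numLayers slices; kvFactor lets a
// combined cache (kvFactor == 1) be sized and traversed correctly.
std::size_t kvCacheBlockNumElements(unsigned int numLayers, unsigned int numHeads,
    unsigned int tokensPerBlock, unsigned int numHidden, int kvFactor)
{
    return static_cast<std::size_t>(numLayers) * static_cast<std::size_t>(kvFactor)
        * numHeads * tokensPerBlock * numHidden;
}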
@@ -90,8 +90,8 @@ void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
         "Block shape is incorrect");
     TLLM_CHECK_WITH_INFO(numTokensToCopy <= tokensPerBlock,
         "numTokensToCopy (%d) must be <= tokensPerBlock (%d)", numTokensToCopy, tokensPerBlock);
-    tk::kvCacheBlockPartialCopy(
-        *dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead, numTokensToCopy, stream);
+    tk::kvCacheBlockPartialCopy(*dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead,
+        numTokensToCopy, kvFactor, stream);
 }
 }
 }
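At this call site the TLLM_CHECK_WITH_INFO already bounds the token range; the substance of the fix is that kvFactor now reaches the kernel instead of being hard-coded inside it. A sketch of why the hard-coded factor could touch memory outside the block, under the same layout assumption as above (all names hypothetical):

#include <cassert>
#include <cstddef>

// Sketch: with a combined cache (kvFactor == 1) the pre-fix kernel still
// walked 2 * numLayers slices, stepping past the end of the allocation.
void illustrateOverrun(unsigned int numLayers, int kvFactor, unsigned int numHeads,
    unsigned int tokensPerBlock, unsigned int numHidden)
{
    std::size_t const slice = static_cast<std::size_t>(numHeads) * tokensPerBlock * numHidden;
    std::size_t const allocated = numLayers * static_cast<std::size_t>(kvFactor) * slice;
    std::size_t const touchedByOldKernel = 2 * static_cast<std::size_t>(numLayers) * slice;
    if (kvFactor == 1)
    {
        assert(touchedByOldKernel > allocated); // the reported illegal access
    }
}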
@@ -58,7 +58,8 @@ unsigned int ipow2(unsigned int v)
 
 template <typename T>
 void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream)
 {
     unsigned int blockX = ipow2(numHidden); // ensure block shape is a power of 2
     blockX = std::min(blockX, 32u); // blockX should not exceed warp size
@@ -75,12 +76,13 @@ void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int
     auto srcData = bufferCast<T>(src);
     auto dstData = bufferCast<T>(dst);
     cuKVCacheBlockPartialCopy<<<grid, block, 0, stream>>>(
-        dstData, srcData, 2 * numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
+        dstData, srcData, numLayers * kvFactor, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
 }
 } // namespace
 
 void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream)
 {
     auto dataType = src.getDataType();
     TLLM_CHECK_WITH_INFO(dataType == dst.getDataType(), "src and dst dataType does not match");
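The kernel launch hunk carries the core of the fix: the leading dimension passed to cuKVCacheBlockPartialCopy is now numLayers * kvFactor rather than the hard-coded 2 * numLayers, while the surrounding context sizes the thread block from numHidden, rounded up to a power of two and capped at the 32-thread warp width. A minimal sketch of that block-sizing logic, assuming ipow2 rounds up to the next power of two (its body is not shown in the diff):

#include <cuda_runtime.h>
#include <algorithm>

// Assumed behavior of the diff's ipow2: round v up to the next power of two.
unsigned int nextPow2(unsigned int v)
{
    unsigned int p = 1u;
    while (p < v)
    {
        p <<= 1;
    }
    return p;
}

// Sketch of the block sizing visible in the hunk's context lines; the grid
// sizing over numLayers * kvFactor slices is inferred, not shown.
dim3 makeBlockDim(unsigned int numHidden)
{
    unsigned int blockX = nextPow2(numHidden); // power-of-two x-dimension
    blockX = std::min(blockX, 32u);            // never wider than one warp
    return dim3(blockX, 1u, 1u);
}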
@@ -88,42 +90,42 @@ void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numL
     {
     case nvinfer1::DataType::kINT64:
         hostKVCacheBlockPartialCopy<SizeType64>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kINT32:
         hostKVCacheBlockPartialCopy<std::int32_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kFLOAT:
         hostKVCacheBlockPartialCopy<float>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #ifdef ENABLE_BF16
     case nvinfer1::DataType::kBF16:
         hostKVCacheBlockPartialCopy<__nv_bfloat16>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #endif
     case nvinfer1::DataType::kHALF:
         hostKVCacheBlockPartialCopy<half>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kBOOL:
         hostKVCacheBlockPartialCopy<bool>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kUINT8:
         hostKVCacheBlockPartialCopy<std::uint8_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kINT8:
         hostKVCacheBlockPartialCopy<std::int8_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #ifdef ENABLE_FP8
     case nvinfer1::DataType::kFP8:
         hostKVCacheBlockPartialCopy<__nv_fp8_e4m3>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #endif
     default: TLLM_THROW("Unknown data type");
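The switch above is a standard runtime-tag-to-template dispatch: src.getDataType() selects the instantiation, and every branch must forward the identical argument list, so a single branch left on the old signature would reintroduce the bug (the compiler catches it here because the old overload no longer exists). A reduced sketch of the pattern with stand-in types, not the real TensorRT-LLM API:

#include <cstddef>
#include <cstdint>
#include <stdexcept>

// Stand-in dtype tag and copy routine; the real code switches on
// nvinfer1::DataType and calls hostKVCacheBlockPartialCopy<T>.
enum class DType { kFloat, kInt8 };

template <typename T>
void partialCopyImpl(T* dst, T const* src, std::size_t numElems, int kvFactor)
{
    (void) kvFactor; // forwarded so every instantiation sees the same layout
    for (std::size_t i = 0; i < numElems; ++i)
    {
        dst[i] = src[i];
    }
}

void dispatchPartialCopy(DType t, void* dst, void const* src, std::size_t n, int kvFactor)
{
    switch (t)
    {
    case DType::kFloat:
        partialCopyImpl(static_cast<float*>(dst), static_cast<float const*>(src), n, kvFactor);
        break;
    case DType::kInt8:
        partialCopyImpl(static_cast<std::int8_t*>(dst), static_cast<std::int8_t const*>(src), n, kvFactor);
        break;
    default: throw std::runtime_error("Unknown data type");
    }
}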
@@ -443,8 +443,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                           [0, pytest.param(2, marks=skip_pre_hopper)])
     def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                       overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -485,8 +483,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
                             torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -529,8 +525,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
                               overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -609,8 +603,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -407,7 +407,6 @@ accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://n
 accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5231468)
 accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image] SKIP (https://nvbugs/5233423)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5294983)
 examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
@@ -436,17 +435,6 @@ triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-deco
 triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP
 triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP
 triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble] SKIP
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5285965)
 examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] SKIP (https://nvbugs/5289523)
 examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (https://nvbugs/5289523)
 examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5289904)