mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[fix] Fix illegal mem access and possible accuracy loss (#4943)
Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
parent 20425deb3b
commit ff4212377c
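The change threads a new kvFactor argument through kvCacheBlockPartialCopy instead of hardcoding the K/V multiplicity: the kernel launch previously scaled the layer dimension as 2 * numLayers (one K plus one V sub-tensor per layer), which overruns the block for cache layouts that store a single sub-tensor per layer (kvFactor == 1, as in K-only caches), consistent with the illegal memory access and accuracy loss named in the title. The test hunks at the end re-enable DeepSeek-V3-Lite accuracy cases that had been skipped or waived. A minimal sketch of the size arithmetic, assuming an illustrative pooled-block layout (names are mine, not the repo's):

// Illustrative only: how the element extent of one cache block scales with
// kvFactor. kvFactor == 2 covers K and V; kvFactor == 1 covers single-tensor
// layouts, where a hardcoded factor of 2 walks past the block's end.
#include <cstddef>

std::size_t blockExtentElems(std::size_t numLayers, std::size_t numHeads,
    std::size_t tokensPerBlock, std::size_t numHidden, int kvFactor)
{
    return numLayers * static_cast<std::size_t>(kvFactor) * numHeads * tokensPerBlock * numHidden;
}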
@@ -23,6 +23,7 @@ namespace tensorrt_llm
 namespace kernels
 {
 void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream);
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream);
 } // namespace kernels
 } // namespace tensorrt_llm
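Annotated form of the new signature from this hunk; the per-parameter comments are my reading of the diff rather than documentation from the repo, and IBuffer/cudaStream_t are stubbed so the snippet compiles stand-alone:

struct IBuffer;             // stand-in for the runtime buffer type
using cudaStream_t = void*; // stand-in for CUDA's stream handle

void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src,
    unsigned int numLayers,       // layers pooled in one cache block
    unsigned int numHeads,        // KV heads per layer
    unsigned int tokensPerBlock,  // block capacity in tokens
    unsigned int numHidden,       // elements per head
    unsigned int numTokensToCopy, // leading tokens to copy, <= tokensPerBlock
    int kvFactor,                 // sub-tensors per layer: 2 for K+V, 1 for K-only caches
    cudaStream_t stream);         // stream the copy kernel is launched on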
@@ -90,8 +90,8 @@ void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
         "Block shape is incorrect");
     TLLM_CHECK_WITH_INFO(numTokensToCopy <= tokensPerBlock,
         "numTokensToCopy (%d) must be <= tokensPerBlock (%d)", numTokensToCopy, tokensPerBlock);
-    tk::kvCacheBlockPartialCopy(
-        *dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead, numTokensToCopy, stream);
+    tk::kvCacheBlockPartialCopy(*dstPtr, *srcPtr, numLayers, numHeads, tokensPerBlock, sizePerHead,
+        numTokensToCopy, kvFactor, stream);
 }
 }
 }
@@ -58,7 +58,8 @@ unsigned int ipow2(unsigned int v)
 
 template <typename T>
 void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream)
 {
     unsigned int blockX = ipow2(numHidden); // ensure block shape is a power of 2
     blockX = std::min(blockX, 32u); // blockX should not exceed warp size
@@ -75,12 +76,13 @@ void hostKVCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int
     auto srcData = bufferCast<T>(src);
     auto dstData = bufferCast<T>(dst);
     cuKVCacheBlockPartialCopy<<<grid, block, 0, stream>>>(
-        dstData, srcData, 2 * numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
+        dstData, srcData, numLayers * kvFactor, numHeads, tokensPerBlock, numHidden, numTokensToCopy);
 }
 } // namespace
 
 void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
-    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, cudaStream_t stream)
+    unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
+    cudaStream_t stream)
 {
     auto dataType = src.getDataType();
     TLLM_CHECK_WITH_INFO(dataType == dst.getDataType(), "src and dst dataType does not match");
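The substantive kernel-side fix is the launch argument above: 2 * numLayers becomes numLayers * kvFactor. A host-side reference loop, under an assumed [layer * kvFactor][head][token][hidden] block layout (the kernel's actual layout is not visible in this diff), makes the copy bound explicit:

#include <algorithm>
#include <cstddef>

template <typename T>
void partialCopyRef(T* dst, T const* src, unsigned numLayers, unsigned numHeads,
    unsigned tokensPerBlock, unsigned numHidden, unsigned numTokensToCopy, int kvFactor)
{
    // Copy the first numTokensToCopy tokens of every (layer, K/V) sub-tensor.
    // With kvFactor == 1, iterating 2 * numLayers here would run numLayers
    // sub-tensors past the end of the block.
    for (unsigned l = 0; l < numLayers * static_cast<unsigned>(kvFactor); ++l)
        for (unsigned h = 0; h < numHeads; ++h)
            for (unsigned t = 0; t < numTokensToCopy; ++t)
            {
                std::size_t base = ((std::size_t{l} * numHeads + h) * tokensPerBlock + t) * numHidden;
                std::copy(src + base, src + base + numHidden, dst + base);
            }
}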
@@ -88,42 +90,42 @@ void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numL
     {
     case nvinfer1::DataType::kINT64:
         hostKVCacheBlockPartialCopy<SizeType64>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kINT32:
         hostKVCacheBlockPartialCopy<std::int32_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kFLOAT:
         hostKVCacheBlockPartialCopy<float>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #ifdef ENABLE_BF16
     case nvinfer1::DataType::kBF16:
         hostKVCacheBlockPartialCopy<__nv_bfloat16>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #endif
     case nvinfer1::DataType::kHALF:
         hostKVCacheBlockPartialCopy<half>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kBOOL:
         hostKVCacheBlockPartialCopy<bool>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kUINT8:
         hostKVCacheBlockPartialCopy<std::uint8_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
     case nvinfer1::DataType::kINT8:
         hostKVCacheBlockPartialCopy<std::int8_t>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #ifdef ENABLE_FP8
     case nvinfer1::DataType::kFP8:
         hostKVCacheBlockPartialCopy<__nv_fp8_e4m3>(
-            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, stream);
+            dst, src, numLayers, numHeads, tokensPerBlock, numHidden, numTokensToCopy, kvFactor, stream);
         break;
 #endif
     default: TLLM_THROW("Unknown data type");
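The dispatch switch forwards kvFactor unchanged into every per-dtype instantiation. A tiny self-contained check of the intended semantics (hypothetical, CPU-only, float for simplicity): with kvFactor = 1 every index stays inside the block, while doubling the layer loop, as the pre-fix launch effectively did, would trip the bounds assertion:

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    unsigned const numLayers = 2, numHeads = 1, tokensPerBlock = 4, numHidden = 3;
    unsigned const numTokensToCopy = 2;
    int const kvFactor = 1; // single sub-tensor per layer
    std::size_t const blockElems
        = std::size_t{numLayers} * kvFactor * numHeads * tokensPerBlock * numHidden;
    std::vector<float> src(blockElems, 1.f), dst(blockElems, 0.f);

    for (unsigned l = 0; l < numLayers * static_cast<unsigned>(kvFactor); ++l) // pre-fix: 2 * numLayers
        for (unsigned h = 0; h < numHeads; ++h)
            for (unsigned t = 0; t < numTokensToCopy; ++t)
                for (unsigned e = 0; e < numHidden; ++e)
                {
                    std::size_t const i
                        = ((std::size_t{l} * numHeads + h) * tokensPerBlock + t) * numHidden + e;
                    assert(i < blockElems); // fails if the layer loop is doubled
                    dst[i] = src[i];
                }

    assert(dst.front() == 1.f); // first token copied
    assert(dst.back() == 0.f);  // tokens beyond numTokensToCopy untouched
    return 0;
}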
@@ -443,8 +443,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
                           [0, pytest.param(2, marks=skip_pre_hopper)])
     def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                       overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -485,8 +483,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
                             torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -529,8 +525,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
                               overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -609,8 +603,6 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5292037")
         if torch_compile and mtp_nextn > 0:
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
@@ -407,7 +407,6 @@ accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://n
 accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5231468)
 accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image] SKIP (https://nvbugs/5233423)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5294983)
 examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
 examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058)
@@ -436,17 +435,6 @@ triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-deco
 triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] SKIP
 triton_server/test_triton.py::test_gpt_ib_speculative_decoding_bls[gpt-ib-speculative-decoding-bls] SKIP
 triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble] SKIP
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5285965)
 examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] SKIP (https://nvbugs/5289523)
 examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (https://nvbugs/5289523)
 examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5289904)