/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/batch_manager/kvCacheEventManager.h"
#include "tensorrt_llm/batch_manager/kvCacheTransferManager.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/kernels/kvCacheIndex.h"
#include "tensorrt_llm/kernels/kvCacheUtils.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/samplingConfig.h"

#include "gtest/gtest.h"

#include <gmock/gmock.h>

#include <chrono>
#include <deque>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <thread>
#include <vector>

using namespace tensorrt_llm::batch_manager;
using namespace tensorrt_llm::batch_manager::kv_cache_manager;
using namespace tensorrt_llm::executor;

namespace tc = tensorrt_llm::common;
namespace tk = tensorrt_llm::kernels;
namespace tlk = tensorrt_llm::batch_manager::kv_cache_manager;
namespace tle = tensorrt_llm::executor;
namespace tr = tensorrt_llm::runtime;

using ParamType = bool;

namespace
{
std::string generateTestName(testing::TestParamInfo<ParamType> const& info)
{
    auto const homogeneousLayers = info.param;
    std::string name = "KVCacheManagerTest";
    if (homogeneousLayers)
    {
        name += "Homogeneous";
    }
    else
    {
        name += "Heterogeneous";
    }
    return name;
}

SizeType32 theOnlyWindowSize(KVCacheManager const& kvCacheManager)
{
    auto const& blockManager = kvCacheManager.getBlockManager();
    EXPECT_EQ(blockManager.getWindowSizesMetadata().size(), 1) << "Assuming a single window size";
    return blockManager.getPoolWindowSize(0);
}
} // namespace

class KVCacheManagerTest : public ::testing::Test,
                           public ::testing::WithParamInterface<ParamType> // NOLINT(cppcoreguidelines-pro-type-member-init)
{
protected:
    void SetUp() override
    {
        auto const deviceCount = tc::getDeviceCount();
        if (deviceCount == 0)
        {
            GTEST_SKIP();
        }
    }

    void TearDown() override {}
};

namespace
{
// TODO: This is really ugly. Flushing the event queue is done in a separate thread, so if we want to check the value
// we need to wait for the thread to complete. It works, but it's technically not deterministic.
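// getEvents triggers a flush, sleeps briefly so the flusher thread has a chance to publish, then polls
// getLatestEvents with a timeout; both waits are plain timing margins rather than real synchronization.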
std::deque<tle::KVCacheEvent> getEvents(KVCacheManager& kvCacheManager)
{
    kvCacheManager.flushIterationEvents();
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    return kvCacheManager.getLatestEvents(std::chrono::milliseconds(100));
}
} // namespace

TEST_F(KVCacheManagerTest, BlockManagerTest)
{
    auto constexpr numLayers = 12;
    auto constexpr numKvHeads = 6;
    auto constexpr sizePerHead = 128;
    auto constexpr tokensPerBlock = 64;
    auto constexpr blocksInPrimaryPool = 24;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr maxNumSequences = 8;
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr onboardBlocks = true;

    auto constexpr beamWidth = 8;
    auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth;
    auto constexpr numTokens = tokensPerBlock * numBlocksPerBeam;
    auto constexpr maxAttentionWindow = numTokens;

    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock,
        blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth,
        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks);
    blockManager.allocatePools(false);

    EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
    EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    auto constexpr requestId = 42;
    GenerationRequest seq0{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};

    blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow);
    auto constexpr occupiedBlocks = (numBlocksPerBeam - 1) + beamWidth;
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - occupiedBlocks);

    auto const& ids = seq0.getCacheBlockIds(maxAttentionWindow);
    std::set<SizeType32> idSet{};
    EXPECT_EQ(ids.size(), beamWidth);
    for (auto const& beam : ids)
    {
        EXPECT_EQ(beam.size(), blocksInPrimaryPool / beamWidth);
        idSet.insert(beam.begin(), beam.end());
    }
    EXPECT_EQ(idSet.size(), occupiedBlocks);

    blockManager.releaseBlocks(seq0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    blockManager.addSequence(seq0, numBlocksPerBeam, -1, maxAttentionWindow);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocksPerBeam);
    EXPECT_EQ(ids.size(), beamWidth);
    for (std::size_t i = 0u; i < ids.front().size(); ++i)
    {
        for (std::size_t beam = 1u; beam < ids.size(); ++beam)
        {
            EXPECT_EQ(ids.at(beam).at(i), ids.at(0).at(i));
        }
    }

    blockManager.releaseBlocks(seq0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    // occupy 22/24 blocks
    EXPECT_NO_THROW(blockManager.addSequence(seq0, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow));
    GenerationRequest seq1{requestId + 1, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
    EXPECT_NO_THROW(blockManager.addSequence(seq1, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow));
    // same requestId not allowed
    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
    EXPECT_THROW(
        blockManager.addSequence(seq2, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow), std::runtime_error);
    // no more blocks
    GenerationRequest seq3{requestId + 2, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
    EXPECT_THROW(
        blockManager.addSequence(seq3, numBlocksPerBeam, numBlocksPerBeam - 1, maxAttentionWindow), std::runtime_error);
}

template <nvinfer1::DataType type, typename T, int32_t mask>
void runPartialCopyTest()
{
    auto constexpr numLayers = 12;
    auto constexpr numKvHeads = 6;
    auto constexpr sizePerHead = 128;
    auto constexpr tokensPerBlock = 8;
    auto
constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 4; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr batchSize = 1; auto constexpr maxBlocksPerSeq = 10; auto constexpr bytesPerToken = 4; auto constexpr maxAttentionWindow = 4096; auto constexpr maxAttentionWindowAllLayer = 4096; auto constexpr sinkTokenLen = 0; auto constexpr canUseOneMoreBlock = true; SizeType32 constexpr maxNewTokens{0}; auto constexpr beamWidth = 1; auto constexpr beamIdx = 0; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, type, 0, onboardBlocks); blockManager.allocatePools(false); auto oneLayerBlockSize = blockManager.getBlockSize(0); EXPECT_EQ(oneLayerBlockSize, numKvHeads * sizePerHead * tokensPerBlock); auto primaryPoolPtr = blockManager.getPrimaryPool(0); auto secondaryPoolPtr = blockManager.getSecondaryPool(0); tk::KVBlockArray kvCacheBlockArray(batchSize, maxBlocksPerSeq, tokensPerBlock, bytesPerToken, maxAttentionWindow, maxAttentionWindowAllLayer, sinkTokenLen, canUseOneMoreBlock, primaryPoolPtr->data(), secondaryPoolPtr->data(), nullptr); // Verify that shape of block for one layer of K or V is [numKvHeads, tokensPerBlock, sizePerHead] by comparing // against KVBlockArray::getKVLocalIdx method. We make this assumption in partialCopy kernel. auto constexpr localTokenIdx = 3; auto constexpr headIdx = 5; auto constexpr channelIdx = 7; auto localKIdx = kvCacheBlockArray.getKVLocalIdx(localTokenIdx, headIdx, sizePerHead, channelIdx); EXPECT_EQ(localKIdx, (headIdx * tokensPerBlock + localTokenIdx) * sizePerHead + channelIdx); // Pool block has shape [2, numLayers, numKvHeads, tokensPerBlock, sizePerHead] auto blockSize = 2 * numLayers * oneLayerBlockSize; auto primaryPoolSize = blocksInPrimaryPool * blockSize; auto secondaryPoolSize = blocksInSecondaryPool * blockSize; // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] (17 tokens, three blocks) auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); auto cacheBlockIds = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds, ::testing::ElementsAreArray({0, 1, 2})); // Offload all 3 blocks, fill with predictable pattern, onboard for (auto cacheBlockId : cacheBlockIds) { auto block = blockManager.getBlockById(cacheBlockId, maxAttentionWindow); EXPECT_TRUE(block->isPrimary()); // offload so we can write to block in CPU code blockManager.offloadBlock(block, maxAttentionWindow); EXPECT_FALSE(block->isPrimary()); // need to sync so D2H transfer is done before accessing blocks EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); // fill 
with predictable pattern auto memoryPoolIndex = block->getMemoryPoolBlockIndex(); auto blockPtr{tr::ITensor::slice(secondaryPoolPtr, memoryPoolIndex, 1)}; auto rawBlockPtr = reinterpret_cast(blockPtr->data()); for (int i = 0; i < blockSize; ++i) { rawBlockPtr[i] = i & mask; } // onboard blockManager.onboardBlock(block, maxAttentionWindow); EXPECT_TRUE(block->isPrimary()); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); EXPECT_TRUE(blockManager.verifyQueueIntegrity(maxAttentionWindow)); } blockManager.releaseBlocks(seq0, llmRequest0); // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] auto inputTokens1 = inputTokens; auto const inputLength1 = static_cast(inputTokens1->size()); requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming); GenerationRequest seq1{requestId, inputLength1, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 16); auto cacheBlockIds1 = seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds1, ::testing::ElementsAreArray({0, 1, 6})); // store blocks 0, 1 ([0,1,2,3,4,5,6,7], [8,9,10,11,12,13,14,15]) blockManager.storeContextBlocks(seq1, *llmRequest1); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11] again. // Reuse blocks 0 and 1(pc). Block 1 is partially reused, but already referenced by seq1 so must be partial copied // into new block 2. Clear block 2 so we can see what was partial copied. auto block2 = blockManager.getBlockById(2, maxAttentionWindow); auto memoryPoolIndex2 = block2->getMemoryPoolBlockIndex(); auto block2Ptr{tr::ITensor::slice(primaryPoolPtr, memoryPoolIndex2, 1)}; EXPECT_EQ(cudaMemset(block2Ptr->data(), 0, blockSize * sizeof(T)), cudaSuccess); auto inputTokens2 = inputTokens; auto constexpr partiallyReusedTokens = 3; inputTokens2->resize(8 + partiallyReusedTokens + 1); auto const inputLength2 = static_cast(inputTokens2->size()); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens2, samplingConfig, isStreaming); GenerationRequest seq2{requestId, inputLength2, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 11); auto cacheBlockIds2 = seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds2, ::testing::ElementsAreArray({0, 2})); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); // Verify partial copied block 2 // Block has shape [2, numLayers, numKvHeads, tokensPerBlock, sizePerHead] blockManager.offloadBlock(block2, maxAttentionWindow); EXPECT_FALSE(block2->isPrimary()); // need to sync so D2H transfer is done before accessing blocks EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); memoryPoolIndex2 = block2->getMemoryPoolBlockIndex(); block2Ptr = tr::ITensor::slice(secondaryPoolPtr, memoryPoolIndex2, 1); T const* rawPtr2 = reinterpret_cast(block2Ptr->data()); int numBad = 0; for (int i = 0; i < blockSize && numBad < 10; ++i) { T value = rawPtr2[i]; int kOrV = i / (numLayers * numKvHeads * 
tokensPerBlock * sizePerHead); int j = i - kOrV * (numLayers * numKvHeads * tokensPerBlock * sizePerHead); int layer = j / (numKvHeads * tokensPerBlock * sizePerHead); j = j - layer * (numKvHeads * tokensPerBlock * sizePerHead); int head = j / (tokensPerBlock * sizePerHead); j = j - head * (tokensPerBlock * sizePerHead); int token = j / sizePerHead; j = j - token * sizePerHead; T expectedValue = (token < partiallyReusedTokens) ? i & mask : 0; if (value != expectedValue) { TLLM_LOG_WARNING( "block2[%d,%d,%d,%d,%d] - expected %d, actual %d", kOrV, layer, head, token, j, expectedValue, value); ++numBad; } } EXPECT_EQ(numBad, 0); blockManager.onboardBlock(block2, maxAttentionWindow); EXPECT_TRUE(block2->isPrimary()); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); blockManager.releaseBlocks(seq1, llmRequest1); blockManager.releaseBlocks(seq2, llmRequest2); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT64) { runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT32) { runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyFLOAT) { runPartialCopyTest(); } #ifdef ENABLE_BF16 TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyBF16) { runPartialCopyTest(); } #endif TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyHALF) { runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyBOOL) { runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyUINT8) { runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT8) { runPartialCopyTest(); } #ifdef ENABLE_FP8 TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyFP8) { runPartialCopyTest(); } #endif TEST_F(KVCacheManagerTest, BlockManagerTestBlocksPerWindowSize) { auto constexpr numPrimaryBlocks = 16384; // Single window size { std::map> windowSizeToLayers{{1024, {0, 1, 2}}}; auto result = BlockManager::blocksPerWindowSize(numPrimaryBlocks, windowSizeToLayers); EXPECT_EQ(result.size(), 1); EXPECT_EQ(result.at(1024), numPrimaryBlocks); } // Variable window size { std::map> windowSizeToLayers{ {1024, {1}}, // contribution = 1024*1 = 1024 / 29696 {4096, {0, 4, 5}}, // contribution = 4096*3 = 12288 / 29696 {8192, {2, 3}}, // contribution = 8192*2 = 16384 / 29696 }; auto result = BlockManager::blocksPerWindowSize(numPrimaryBlocks, windowSizeToLayers); EXPECT_EQ(result.size(), 3); EXPECT_EQ(std::accumulate(result.begin(), result.end(), 0, [](auto sum, auto cur) { return sum + cur.second; }), numPrimaryBlocks); // Two blocks that were lost due to rounding down were awarded in order to the smallest window sizes: EXPECT_EQ(result.at(1024), 565); // 564 + 1 EXPECT_EQ(result.at(4096), 6780); // 6779 + 1 EXPECT_EQ(result.at(8192), 9039); // 9039 + 0 } } #ifdef ENABLE_FP4 TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest) { auto constexpr numLayers = 6; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 16; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kFP4, false, stream, true, onboardBlocks); 
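// With an FP4 KV cache the manager is expected to allocate a companion pool holding block scaling factors alongside
// the data pool; the checks below rely on one scale element per group of 16 cache elements, so the scale pool block
// size is 1/16 of the data pool block size.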
kvCacheManager.allocatePools(/*useUvm=*/false); // We should have one additional pool for the block scales. EXPECT_EQ(kvCacheManager.getBlockManager().getNumPools(), 2); EXPECT_EQ(kvCacheManager.getBlockManager().getNumPools(/*includeBlockScalePools=*/false), 1); EXPECT_NE(kvCacheManager.getBlockScalePoolPointers(), nullptr); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_TRUE(blockManager.containsBlockScales(1)); EXPECT_EQ(blockManager.getBlockSize(0) / 16, blockManager.getBlockSize(1)); } #endif TEST_F(KVCacheManagerTest, BlockManagerReuseTest) { auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; /////////////////////////////////////////////////////////////////////////// // add request and then remove it // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8] auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest0->addNewToken(9, beamIdx); // block 2 contains [8] llmRequest0->addNewToken(10, beamIdx); // block 2 contains [8, 9] auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 0, 1, 2 are stored for reuse (blocks contain [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]) blockManager.releaseBlocks(seq0, llmRequest0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // new request with same tokens [0, 1, 2, 3, 4, 5, 6, 7, 8] and then remove it requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, 
isStreaming); GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 3 auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3})); llmRequest1->addNewToken(9, beamIdx); // block 3 contains [8] llmRequest1->addNewToken(10, beamIdx); // block 3 contains [8, 9] EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 3 matches block 2 and will be freed (blocks contain [8, 9]) blockManager.releaseBlocks(seq1, llmRequest1); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] // reuse blocks 0, 1, 2(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8]) :: p = partial reuse auto inputTokens0 = std::make_shared(*inputTokens); inputTokens0->emplace_back(9); llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens0, samplingConfig, isStreaming); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), promptLen0 - 1); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // note that seq0 is holding blocks 0, 1 and 2 until releaseBlocks is called // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] // reuse blocks 0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 4 auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); llmRequest1->addNewToken(10, beamIdx); // block 4 contains [8, 9, 10] EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1); // block 2 is stored for reuse (block contains [8]). nb! Last token of last block is never stored blockManager.releaseBlocks(seq0, llmRequest0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 4 is stored for reuse (block contains [8, 9]). nb! 
Last token of last block is never stored blockManager.releaseBlocks(seq1, llmRequest1); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with less tokens // input tokens [0, 1, 2, 3, 4] auto inputLength2 = tokensPerBlock + 1; auto inputTokens2 = std::make_shared(VecTokens{inputTokens->begin(), inputTokens->begin() + inputLength2}); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens2, samplingConfig, isStreaming); numTokens = llmRequest2->getNumTokens(beamIdx); GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 0 ([0, 1, 2, 3]), get new block 5 auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 5})); llmRequest2->addNewToken(5, beamIdx); // block 5 contains [4] numTokens = llmRequest2->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); /////////////////////////////////////////////////////////////////////////// // add request with more tokens // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11] auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11}); requestId = 3; auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens3, samplingConfig, isStreaming); numTokens = llmRequest3->getNumTokens(beamIdx); GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, 1, 4(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8, 9]) auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), numTokens - 1); EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); llmRequest3->addNewToken(11, beamIdx); // block 4 contains [8, 9, 11] numTokens = llmRequest3->getNumTokens(beamIdx); // one block used by both seq2 and seq3 numBlocks += tc::ceilDiv(numTokens, tokensPerBlock) - 1; EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 5 is not stored since it is last block and has only one token blockManager.releaseBlocks(seq2, llmRequest2); // block 4 is stored for reuse (block contains [8, 9]). nb! 
Last token of last block not stored blockManager.releaseBlocks(seq3, llmRequest3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with 11 tokens, then discard few tokens from request and release a shorter one auto inputTokens4 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12}); auto inputTokens4Short = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8}); requestId = 4; auto llmRequest4 = std::make_shared(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming); numTokens = llmRequest4->getNumTokens(beamIdx); GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, 1, 4(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8,9]) auto promptLen4 = llmRequest4->getNumTokens(beamIdx); auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 1); EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); auto llmRequest4Short = std::make_shared(requestId, maxNewTokens, inputTokens4Short, samplingConfig, isStreaming); // llmRequest4Short tokens [0, 1, 2, 3, 4, 5, 6, 7, 8] // blocks 0 and 1 ([0, 1, 2, 3], [4, 5, 6, 7]) are already stored, // block 4 is freed blockManager.releaseBlocks(seq4, llmRequest4Short); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with 11 tokens again and make sure no discarded tokens reuse happens // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12] // reuse blocks 0, 1, 2(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8]) // nb! LlmRequest retains state calculated during addSequence, this state affects result. // Calling addSequence a second time with same LlmRequest object will produce incorrect state. // Create new llmRequest4 instance to avoid this issue. llmRequest4 = std::make_shared(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming); promptLen4 = llmRequest4->getNumTokens(beamIdx); numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 2); EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); blockManager.releaseBlocks(seq4, llmRequest4); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with max size with incidental reuse of first token. 
// this happens because we initialize inputTokens5 with 0's. auto inputLength5 = blocksInPrimaryPool * tokensPerBlock - 1; auto inputTokens5 = std::make_shared(VecTokens(inputLength5, 0)); requestId = 5; auto llmRequest5 = std::make_shared(requestId, maxNewTokens, inputTokens5, samplingConfig, isStreaming); numTokens = llmRequest5->getNumTokens(beamIdx); GenerationRequest seq5{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, all blocks need to be freed auto promptLen5 = llmRequest5->getNumTokens(beamIdx); auto numContextBlocks5 = tc::ceilDiv(promptLen5, blockManager.getTokensPerBlock()); blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5, maxAttentionWindow); llmRequest5->addNewToken(0, beamIdx); EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 1); // incidental reuse EXPECT_EQ(blockManager.getNumAllocatedBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), 0); blockManager.releaseBlocks(seq5, llmRequest5); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with min size that doesn't reuse blocks auto inputLength6 = 1; auto inputTokens6 = std::make_shared(VecTokens(inputLength6, 0)); requestId = 6; auto llmRequest6 = std::make_shared(requestId, maxNewTokens, inputTokens6, samplingConfig, isStreaming); numTokens = llmRequest6->getNumTokens(beamIdx); GenerationRequest seq6{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, all blocks need to be freed auto promptLen6 = llmRequest6->getNumTokens(beamIdx); auto numContextBlocks6 = tc::ceilDiv(promptLen6, blockManager.getTokensPerBlock()); blockManager.addSequence(seq6, promptLen6, numContextBlocks6, *llmRequest6, maxAttentionWindow); llmRequest6->addNewToken(0, beamIdx); // no reuse occurs because we are unable to reuse last input token and inputLength6 == 1. 
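// Even so, the single prompt token still claims a whole block (tokensPerBlock == 4), hence exactly one allocated
// block below.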
EXPECT_EQ(llmRequest6->getContextCurrentPosition(), 0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - 1); blockManager.releaseBlocks(seq6, llmRequest6); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); } TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest) { // tc::Logger::getLogger()->setLevel(tc::Logger::Level::DEBUG); using VecTokenExtraIds = LlmRequest::VecTokenExtraIds; auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr numReturnSequences = 1; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // assume prompt id starts from 100 auto inputTokens = std::make_shared(VecTokens{100, 101, 102, 103, 104, 105, 0, 1, 2}); auto inputTokenExtraIds = std::make_shared(VecTokenExtraIds{1, 1, 2, 2, 3, 3, 0, 0, 0}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; /////////////////////////////////////////////////////////////////////////// // add request and then remove it auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest0->addNewToken(3, beamIdx); llmRequest0->addNewToken(4, beamIdx); auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 0, 1, 2 are stored 
for reuse (block 2 contains [(2, 0), (3, 0)]) blockManager.releaseBlocks(seq0, llmRequest0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // new request with same tokens and then remove it requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences); GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, 1 and get new block 3 auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3})); llmRequest1->addNewToken(3, beamIdx); llmRequest1->addNewToken(4, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 3 matches block 2 and will be freed blockManager.releaseBlocks(seq1, llmRequest1); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them // reuse blocks 0, 1 and get new block 4 llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); llmRequest0->addNewToken(3, beamIdx); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // reuse blocks 0, 1 and reuse block 2 auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); auto inputTokenExtraIds1 = std::make_shared(*inputTokenExtraIds); inputTokenExtraIds1->push_back(0); inputTokenExtraIds1->push_back(0); llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming, 
std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1, numReturnSequences); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest1->addNewToken(5, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1); blockManager.releaseBlocks(seq0, llmRequest0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 2 is stored for reuse (block contains [(2, 0), (3, 0), (4, 0)]) blockManager.releaseBlocks(seq1, llmRequest1); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with totally different extra ids auto inputTokenExtraIds2 = std::make_shared(VecTokenExtraIds{4, 4, 5, 5, 6, 6, 0, 0, 0}); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds2, numReturnSequences); numTokens = llmRequest2->getNumTokens(beamIdx); GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, get new block 5, 6, 7 auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0); EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7})); llmRequest2->addNewToken(3, beamIdx); numTokens = llmRequest2->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); /////////////////////////////////////////////////////////////////////////// // add request with partial different extra ids auto inputTokenExtraIds3 = std::make_shared(VecTokenExtraIds{1, 1, 2, 2, 4, 4, 0, 0, 0}); requestId = 3; auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, 
std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds3, numReturnSequences); numTokens = llmRequest3->getNumTokens(beamIdx); GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 0, get new block 8, 9 auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 8, 9})); llmRequest3->addNewToken(3, beamIdx); numTokens = llmRequest3->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseBlocks(seq3, llmRequest3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); } TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest) { // tc::Logger::getLogger()->setLevel(tc::Logger::Level::DEBUG); using VecTokenExtraIds = LlmRequest::VecTokenExtraIds; auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // loraTaskId is 0 for common cases LlmRequest::LoraTaskIdType loraTaskId{0}; auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; /////////////////////////////////////////////////////////////////////////// // add request and then remove it auto constexpr beamIdx = 0; auto promptLen0 = 
llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); // get new blocks 0, 1, 2 ([0,1,2,3], [4,5,6,7], [8]) blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest0->addNewToken(9, beamIdx); llmRequest0->addNewToken(10, beamIdx); auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // store blocks 0, 1, 2 for reuse ([0,1,2,3], [4,5,6,7], [8,9]) blockManager.releaseBlocks(seq0, llmRequest0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // new request with same tokens and loraTaskId, then remove it // inputTokens = (0, 1, 2, 3, 4, 5, 6, 7, 8) requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, 1 and get new block 3 auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3})); llmRequest1->addNewToken(9, beamIdx); llmRequest1->addNewToken(10, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // store block 3 for reuse ([8,9]) blockManager.releaseBlocks(seq1, llmRequest1); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them // inputTokens = (0, 1, 2, 3, 4, 5, 6, 7, 8) // reuse blocks 0, 1 and get new block 4 llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); // nb! addNewToken adds new generated token, number of input tokens stay the same. // calling addNewToken before addSequence potentially triggers this error message: // Assertion failed: prepopulatedPromptLen < promptLen // because maximum value for prepopulatedPromptLen is number of input+output tokens - 1, // but promptLen is number of input tokens. 
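// Example: with 9 prompt tokens, adding a generated token first would allow up to 9 cached tokens to be matched,
// making prepopulatedPromptLen equal to promptLen (9) and tripping the assertion.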
llmRequest0->addNewToken(9, beamIdx); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // inputTokens1 = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); // reuse 0, 1, 2(p) ([0,1,2,3], [4,5,6,7], [8]) blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest1->addNewToken(10, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1); // store block 4 for reuse ([8]) blockManager.releaseBlocks(seq0, llmRequest0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 2 is stored for reuse (block contains [8, 9]). nb! Last token of last block is not stored blockManager.releaseBlocks(seq1, llmRequest1); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with loraTaskId 1 loraTaskId = static_cast(1); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); numTokens = llmRequest2->getNumTokens(beamIdx); GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, get new block 5, 6, 7 auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); // no reuse expected. Input tokens match blocks 0 and 1, but lora task id differs. 
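// The loraTaskId is part of the block matching key, so blocks cached under task 0 cannot be reused by a request
// with task 1 even though the token sequences are identical.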
EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0); EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7})); llmRequest2->addNewToken(9, beamIdx); numTokens = llmRequest2->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // store blocks 5, 6, 7 for reuse ([0,1,2,3], [4,5,6,7], [8]) with loraTaskId 1 blockManager.releaseBlocks(seq2, llmRequest2); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with loraTaskId 1 and more tokens auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11}); requestId = 3; auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens3, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); numTokens = llmRequest3->getNumTokens(beamIdx); GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 5, 6, 7(p) ([0,1,2,3], [4,5,6,7], [8]) auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), promptLen3 - 2); EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7})); llmRequest3->addNewToken(11, beamIdx); numTokens = llmRequest3->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // store block 7 for reuse ([8,9]) with loraTaskId 1 blockManager.releaseBlocks(seq3, llmRequest3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with loraTaskId 0 again but with less tokens loraTaskId = static_cast(0); auto inputLength4 = 5; auto inputTokens4 = std::make_shared(VecTokens{0, 1, 2, 3, 4}); requestId = 4; auto llmRequest4 = std::make_shared(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); numTokens = llmRequest4->getNumTokens(beamIdx); GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, get new block 8 auto promptLen4 = llmRequest4->getNumTokens(beamIdx); auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 8})); llmRequest4->addNewToken(5, beamIdx); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); 
EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 8 is stored with [4] and loraTaskId 0 blockManager.releaseBlocks(seq4, llmRequest4); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); // add request with same tokens as request0 but without loraTaskId requestId = 5; auto llmRequest5 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt); numTokens = llmRequest5->getNumTokens(beamIdx); GenerationRequest seq5{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, get new block 9, 10, 11 auto promptLen5 = llmRequest5->getNumTokens(beamIdx); auto numContextBlocks5 = tc::ceilDiv(promptLen5, blockManager.getTokensPerBlock()); blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5, maxAttentionWindow); EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 0); EXPECT_THAT(seq5.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({9, 10, 11})); llmRequest5->addNewToken(9, beamIdx); numTokens = llmRequest5->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 9, 10, 11 are stored without loraTaskId blockManager.releaseBlocks(seq5, llmRequest5); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); } TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest) { // tc::Logger::getLogger()->setLevel(tc::Logger::Level::DEBUG); using VecTokenExtraIds = LlmRequest::VecTokenExtraIds; auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // assume prompt id starts from 100 auto inputTokens = std::make_shared(VecTokens{100, 101, 102, 103, 104, 105, 0, 1, 2}); auto inputTokenExtraIds = std::make_shared(VecTokenExtraIds{1, 1, 2, 2, 3, 3, 0, 0, 0}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::LoraTaskIdType loraTaskId1{1}; LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, 
std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; /////////////////////////////////////////////////////////////////////////// // add request with loraTaskId 1 and then remove it auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest0->addNewToken(3, beamIdx); llmRequest0->addNewToken(4, beamIdx); auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 0, 1, 2 are stored for reuse (block 2 contains [(2, 0), (3, 0)] with loraTaskId 1) blockManager.releaseBlocks(seq0, llmRequest0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // new request with same tokens but different loraTaskId and then remove it requestId = 1; LlmRequest::LoraTaskIdType loraTaskId2 = static_cast(2); auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds); GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, get new block 3, 4, 5 auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); llmRequest1->addNewToken(3, beamIdx); llmRequest1->addNewToken(4, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 3, 4, 5 are stored for reuse (block 5 contains [(2, 0), (3, 0)] with loraTaskId 2) blockManager.releaseBlocks(seq1, llmRequest1); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them llmRequest0 = 
std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); // reuse blocks 0, 1 and get new block 6 blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); llmRequest0->addNewToken(3, beamIdx); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // reuse blocks 3, 4 and reuse block 5 auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); auto inputTokenExtraIds1 = std::make_shared(*inputTokenExtraIds); inputTokenExtraIds1->push_back(0); inputTokenExtraIds1->push_back(0); llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); llmRequest1->addNewToken(5, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); blockManager.releaseBlocks(seq0, llmRequest0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); blockManager.releaseBlocks(seq1, llmRequest1); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with totally different extra ids and loraTaskId 1 auto inputTokenExtraIds2 = std::make_shared(VecTokenExtraIds{4, 4, 5, 5, 6, 6, 0, 0, 0}); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, 
std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds2); numTokens = llmRequest2->getNumTokens(beamIdx); GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, get new block 7, 8, 9 auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0); EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({7, 8, 9})); llmRequest2->addNewToken(3, beamIdx); numTokens = llmRequest2->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); /////////////////////////////////////////////////////////////////////////// // add request with partial different extra ids and loraTaskId 1 auto inputTokenExtraIds3 = std::make_shared(VecTokenExtraIds{1, 1, 2, 2, 4, 4, 0, 0, 0}); requestId = 3; auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds3); numTokens = llmRequest3->getNumTokens(beamIdx); GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 0, get new block 10, 11 auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 10, 11})); llmRequest3->addNewToken(3, beamIdx); numTokens = llmRequest3->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); /////////////////////////////////////////////////////////////////////////// // add request with partial different extra ids and loraTaskId 2 requestId = 4; auto llmRequest4 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds3); numTokens = llmRequest4->getNumTokens(beamIdx); GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 3, get new block 12, 13 
auto promptLen4 = llmRequest4->getNumTokens(beamIdx); auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 12, 13})); llmRequest4->addNewToken(3, beamIdx); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 3); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 3); blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseBlocks(seq3, llmRequest3); blockManager.releaseBlocks(seq4, llmRequest4); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); } TEST_F(KVCacheManagerTest, KVCacheManagerPerRequestStatsTest) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks); kvCacheManager.allocatePools(false); auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); // Add the sequence to req0 EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest0)); // After first addition, check allocations and reuses auto numBlocks = tc::ceilDiv(inputLength, tokensPerBlock); EXPECT_EQ(llmRequest0->getReusedBlocksPerRequest(), 0); EXPECT_EQ(llmRequest0->getAllocTotalBlocksPerRequest(), numBlocks); EXPECT_EQ(llmRequest0->getAllocNewBlocksPerRequest(), numBlocks); EXPECT_EQ(llmRequest0->getMissedBlocksPerRequest(), numBlocks); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest0)); requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest1)); auto const numSharedBlocks = inputLength / tokensPerBlock; EXPECT_EQ(llmRequest1->getReusedBlocksPerRequest(), numSharedBlocks); EXPECT_EQ(llmRequest1->getAllocTotalBlocksPerRequest(), numBlocks - numSharedBlocks); EXPECT_EQ(llmRequest1->getAllocNewBlocksPerRequest(), numBlocks - numSharedBlocks); EXPECT_EQ(llmRequest1->getMissedBlocksPerRequest(), numBlocks - numSharedBlocks); EXPECT_EQ(llmRequest1->getKVCacheHitRatePerRequest(), static_cast(numSharedBlocks) / static_cast(numBlocks)); } TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest) { auto constexpr numLayers = 12; auto 
constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 4; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // Add a request at a high and very low priority auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); llmRequest0->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, 4, 90), KvCacheRetentionConfig::TokenRangeRetentionConfig(4, 8, 10)}, 20)); GenerationRequest seq0{0, inputLength0, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks0 = tc::ceilDiv(inputLength0, blockManager.getTokensPerBlock()); blockManager.addSequence(seq0, llmRequest0->getNumTokens(0), numContextBlocks0, *llmRequest0, maxAttentionWindow); // Add another sequence with different tokens, at a low priority auto inputTokens1 = std::make_shared(VecTokens{8, 9, 10, 11, 12, 13, 14, 15}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); GenerationRequest seq1{1, inputLength1, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks1 = tc::ceilDiv(inputLength1, blockManager.getTokensPerBlock()); blockManager.addSequence(seq1, llmRequest1->getNumTokens(0), numContextBlocks1, *llmRequest1, maxAttentionWindow); // Release both sequences blockManager.releaseBlocks(seq0, llmRequest0); blockManager.releaseBlocks(seq1, llmRequest1); // Add and then release another sequence auto inputTokens2 = std::make_shared(VecTokens{16, 17, 18, 19, 20, 21, 22, 23}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); llmRequest2->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 20)}, 20)); GenerationRequest seq2{2, inputLength2, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks2 = tc::ceilDiv(inputLength2, blockManager.getTokensPerBlock()); blockManager.addSequence(seq2, llmRequest2->getNumTokens(0), numContextBlocks2, *llmRequest2, maxAttentionWindow); blockManager.releaseBlocks(seq2, llmRequest2); // Check that request 1 blocks were overwritten auto inputTokens3 = std::make_shared(VecTokens{8, 9, 10, 11, 12, 13, 14, 15}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, 
samplingConfig, isStreaming); GenerationRequest seq3{3, inputLength3, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks3 = tc::ceilDiv(inputLength3, blockManager.getTokensPerBlock()); blockManager.addSequence(seq3, llmRequest3->getNumTokens(0), numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 4); blockManager.releaseBlocks(seq3, llmRequest3); EXPECT_EQ(blockManager.getNumFreeBlocks(), 4); // Check that request 0 blocks weren't overwritten auto inputTokens4 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto const inputLength4 = static_cast(inputTokens4->size()); auto llmRequest4 = std::make_shared(4, maxNewTokens, inputTokens4, samplingConfig, isStreaming); GenerationRequest seq4{4, inputLength3, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks4 = tc::ceilDiv(inputLength4, blockManager.getTokensPerBlock()); blockManager.addSequence(seq4, llmRequest4->getNumTokens(0), numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), 4); // Check that request 2 block 0 has been evicted auto inputTokens5 = std::make_shared(VecTokens{16, 17, 18, 19, 20, 21, 22, 23}); auto const inputLength5 = static_cast(inputTokens5->size()); auto llmRequest5 = std::make_shared(5, maxNewTokens, inputTokens5, samplingConfig, isStreaming); GenerationRequest seq5{5, inputLength5, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks5 = tc::ceilDiv(inputLength5, blockManager.getTokensPerBlock()); blockManager.addSequence(seq5, llmRequest5->getNumTokens(0), numContextBlocks5, *llmRequest5, maxAttentionWindow); EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 0); } TEST_F(KVCacheManagerTest, KVCacheManagerDecodeBlockPriorityTest) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 8; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks); kvCacheManager.allocatePools(false); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 8); // Uses 3 blocks 0, 1, 2 which contain [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10] auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); llmRequest0->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 5)}, 90)); kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); // 5 blocks available. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 5); llmRequest0->addNewToken(0, 0); // block 2 contains [8, 9, 10, 11] // Add a token to request 0, which occupies a new block 3. 
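    // Note: kvCacheManager.addToken() grows the sequence's KV cache by one token slot (allocating a new
    // block when the current one is full), while llmRequest->addNewToken() records the generated token on
    // the request itself; the tests pair the two calls (in either order) to keep both views consistent.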
kvCacheManager.addToken(0); llmRequest0->addNewToken(0, 0); // block 3 contains [0] // 4 blocks left. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 4); // uses up 3 more blocks 4, 5, 6. [12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22] auto inputTokens1 = std::make_shared(VecTokens{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); llmRequest1->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 90)}, 5)); kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); // one block left. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 1); llmRequest1->addNewToken(0, 0); // block 6 contains [20, 21, 22, 23] // add another token, which occupies another new block kvCacheManager.addToken(1); llmRequest1->addNewToken(0, 0); // block 7 contains [0] // no block available. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 0); // remove both sequences, blocks get stored // leaf block 3 (priority 90), context blocks 2, 1, 0 (priority 5) kvCacheManager.removeSequence(0, llmRequest0); // leaf block 7 (priority 5), context blocks 6, 5, 4 (priority 90) kvCacheManager.removeSequence(1, llmRequest1); // all blocks are available again. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 8); // no reuse, blocks are evicted by new request: // evict block 7 (lowest priority leaf), block 6 becomes leaf // evict block 3 (lowest and oldest priority leaf), block 2 becomes leaf // evict block 2 (lowest and oldest priority leaf), block 1 becomes leaf // uses up 3 blocks 7, 3, 2. [24, 25, 26, 27], [28, 29, 30, 31], [32, 33, 34] auto inputTokens2 = std::make_shared(VecTokens{24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); // leaf block 2 (priority 35), context blocks 3, 7 (priority 35) kvCacheManager.removeSequence(2, llmRequest2); // reuse blocks 0 and 1, new block 2 (lowest priority leaf) // Uses 3 blocks 0, 1, 2 which contain [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10] auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); // Two blocks reused EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 8); } TEST_F(KVCacheManagerTest, KVCacheManagerTimedEvictionTest) { using namespace tensorrt_llm::executor; using namespace std::chrono_literals; auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, 
maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks); kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); llmRequest0->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 80, 10ms)}, 80)); kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); kvCacheManager.removeSequence(0, llmRequest0); auto inputTokens1 = std::make_shared(VecTokens{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); llmRequest1->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 50)}, 80)); kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); kvCacheManager.removeSequence(1, llmRequest1); std::this_thread::sleep_for(std::chrono::milliseconds(50)); // Manually trigger a refresh. kvCacheManager.refreshBlocks(); // Clear out some of the blocks. auto inputTokens2 = std::make_shared(VecTokens{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); kvCacheManager.removeSequence(2, llmRequest2); // Check that the [12, 13, 14, 15] block is still in the cache auto inputTokens3 = std::make_shared(VecTokens{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 11); } TEST_F(KVCacheManagerTest, KVCacheManagerDecodeTimedEvictionTest) { using namespace tensorrt_llm::executor; using namespace std::chrono_literals; auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks); kvCacheManager.allocatePools(false); { auto inputTokens0 = std::make_shared(VecTokens{1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); llmRequest0->setKvCacheRetentionConfig( 
KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 50)}, 50)); // Set all blocks to priority 50. kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); kvCacheManager.storeContextBlocks(*llmRequest0); for (int i = 0; i < 3; i++) { kvCacheManager.addToken(0); llmRequest0->addNewToken(0, 0); } kvCacheManager.removeSequence(0, llmRequest0); } { auto inputTokens1 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); llmRequest1->setKvCacheRetentionConfig(KvCacheRetentionConfig( {}, KvCacheRetentionConfig::kMaxRetentionPriority, 20ms)); // Set decode blocks to max priority for 20ms. kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); kvCacheManager.storeContextBlocks(*llmRequest1); for (int i = 0; i < 3; i++) { kvCacheManager.addToken(1); llmRequest1->addNewToken(0, 0); } kvCacheManager.removeSequence(1, llmRequest1); } std::this_thread::sleep_for(std::chrono::milliseconds(50)); kvCacheManager.refreshBlocks(); auto inputTokens2 = std::make_shared(VecTokens{0, 0, 0, 0, 0, 0, 0, 0}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); kvCacheManager.removeSequence(2, llmRequest2); auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 8); } TEST_F(KVCacheManagerTest, KVCacheManagerSecondaryBlockPrimaryChildTest) { // It's possible for a block in secondary memory to have a primary child. Make sure this case is handled. 
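    // Note: "secondary" here is the host (offload) pool configured via blocksInSecondaryPool below, and
    // "primary" is the GPU pool. The scenario exercised appears to be a reusable parent block that was
    // offloaded to secondary memory while one of its child blocks still lives in primary memory; onboarding
    // the parent for reuse must not break that parent/child bookkeeping.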
auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 4; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, false, stream, true, onboardBlocks); kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); // get new blocks 0, 1, 2 kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); kvCacheManager.removeSequence(0, llmRequest0); // store blocks 0, 1, 2 for reuse ([0,1,2,3], [4,5,6,7], [8,9,10]) // Offload the last two blocks of llmRequest0 to secondary memory auto inputTokens1 = std::make_shared(VecTokens{1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); // get blocks 3, 2, 1. This causes 2 and 1 to be offloaded to secondary kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); kvCacheManager.removeSequence(1, llmRequest1); // store blocks 3, 2, 1 for reuse ([1,1,2,3], [4,5,6,7], [8,9,10]) // Match the middle block of request 0 auto inputTokens2 = std::make_shared(VecTokens{0, 1, 2, 3}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); // reuse block 0 kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 3); kvCacheManager.storeContextBlocks(*llmRequest2); // Add a decode block that matches the contents of seq 0 block 1, add a unique decode block for (int token = 4; token < 8; token++) { llmRequest2->addNewToken(token, 0); kvCacheManager.addToken(2); } llmRequest2->addNewToken(0, 0); kvCacheManager.addToken(2); llmRequest2->addNewToken(0, 0); kvCacheManager.addToken(2); // The middle block remains in secondary, but the third block is in primary kvCacheManager.removeSequence(2, llmRequest2); auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 0, 0}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); // All blocks should be reused. 
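    // Note: "all" here means every matched token except the final input token -- reuse never covers the
    // last prompt token, since it still has to be processed to produce the first output, which is why the
    // expected context position is 9 for this 10-token prompt.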
EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 9); } TEST_F(KVCacheManagerTest, KVCacheManagerLeafBlockTest) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, false, stream, true, onboardBlocks); kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); kvCacheManager.storeContextBlocks(*llmRequest0); llmRequest0->addNewToken(0, 0); kvCacheManager.addToken(0); // The second block allocated should be first in line for eviction. kvCacheManager.removeSequence(0, llmRequest0); auto inputTokens1 = std::make_shared(VecTokens{1, 1, 2, 3}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); GenerationRequest const& seq1 = kvCacheManager.getSequence(1); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0); // Block 1 should NOT be reused. It was not freed even if partial. 
EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(0), ::testing::ElementsAreArray({2})); // Allocate the remaining 3 blocks in primary auto inputTokens2 = std::make_shared(VecTokens{2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); kvCacheManager.removeSequence(1, llmRequest1); kvCacheManager.removeSequence(2, llmRequest2); auto inputTokens3 = std::make_shared(VecTokens{2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 11); EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 1); kvCacheManager.addToken(3); llmRequest3->addNewToken(0, 0); EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 0); auto inputTokens4 = std::make_shared(VecTokens{42}); auto const inputLength4 = static_cast(inputTokens4->size()); auto llmRequest4 = std::make_shared(4, maxNewTokens, inputTokens4, samplingConfig, isStreaming); EXPECT_THROW(kvCacheManager.addSequence(4, inputLength4, beamWidth, llmRequest4), std::exception); } TEST_P(KVCacheManagerTest, DISABLED_KVCacheManagerAllocationTest) { using DType = half; auto constexpr numLayers = 2; auto constexpr numHeads = 2; auto constexpr sizePerHead = 64; auto constexpr tokensPerBlock = 64; auto constexpr maxBlocksPerSeq = 10; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 4; auto constexpr sinkTokenLength = 0; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const stream = std::make_shared(); auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1; auto constexpr numSharedBlocks = inputLength / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = false; auto constexpr useUvm = false; auto constexpr onboardBlocks = true; auto const homogeneousLayers = GetParam(); auto const granularity = tensorrt_llm::common::getAllocationGranularity(); KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks) : KVCacheManager(std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); auto const blockManager = kvCacheManager.getBlockManager(); auto const& bufferManager = blockManager.getBufferManager(theOnlyWindowSize(kvCacheManager)); auto const memoryPoolUsedBefore = bufferManager.memoryPoolUsed(); kvCacheManager.allocatePools(useUvm); auto const memoryPoolUsedAfter = bufferManager.memoryPoolUsed(); EXPECT_GT(memoryPoolUsedAfter, memoryPoolUsedBefore); auto const actualPoolMemoryUsageDiff = static_cast(memoryPoolUsedAfter - memoryPoolUsedBefore); auto const expectedPoolMemoryUsageDiff = tensorrt_llm::common::roundUp( sizeof(DType) * static_cast(totalNumBlocks) * numLayers * 2 * numHeads * tokensPerBlock * sizePerHead, static_cast(granularity)); EXPECT_EQ(actualPoolMemoryUsageDiff, expectedPoolMemoryUsageDiff); } TEST_P(KVCacheManagerTest, KVCacheManagerTest) { using DType = half; using SizeType32 = KVCacheManager::SizeType32; auto constexpr numLayers = 4; auto constexpr numHeads = 2; // heterogeneous layers std::vector const numHeadsPerLayer({3, 2, 1, 2}); std::map const expectedHeadsPerPool({{0, 1}, {1, 2}, {2, 3}}); std::map const expectedLayersPerPool({{0, 1}, {1, 2}, {2, 1}}); auto constexpr sizePerHead = 64; auto constexpr hiddenSize = numHeads * sizePerHead; auto constexpr tokensPerBlock = 64; auto constexpr maxBlocksPerSeq = 10; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 4; auto constexpr sinkTokenLength = 0; auto const stream = std::make_shared(); auto constexpr requestId = 7; auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1; auto constexpr numSharedBlocks = inputLength / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = false; auto constexpr onboardBlocks = true; auto const homogeneousLayers = GetParam(); auto const expectedNumPools = homogeneousLayers ? 1 : static_cast(expectedHeadsPerPool.size()); KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); kvCacheManager.allocatePools(false); EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth); tr::ITensor::SharedPtr kvCacheBlockOffsets = tr::BufferManager::cpu( tr::ITensor::makeShape({expectedNumPools, maxNumSequences * maxBeamWidth, 2, maxBlocksPerSeq}), tr::TRTDataType::value); auto kvCacheBlockOffsetsRange = tensorrt_llm::runtime::BufferRange(*kvCacheBlockOffsets); std::fill(kvCacheBlockOffsetsRange.begin(), kvCacheBlockOffsetsRange.end(), tk::KVCacheIndex{std::numeric_limits::max()}); kvCacheManager.copyBlockOffsets(*kvCacheBlockOffsets, 0, requestId); EXPECT_EQ(blockManager.getNumPools(), expectedNumPools); if (homogeneousLayers) { EXPECT_EQ(blockManager.getBlockSize(0), numHeads * sizePerHead * tokensPerBlock); } else { for (auto layerIdx = 0; layerIdx < numLayers; layerIdx++) { EXPECT_EQ(blockManager.getBlockSize(blockManager.getLayerPoolIdx(layerIdx)), numHeadsPerLayer.at(layerIdx) * sizePerHead * tokensPerBlock); } } { for (auto poolIdx = 0; poolIdx < blockManager.getNumPools(); poolIdx++) { auto const numLayersInPool = homogeneousLayers ? 
numLayers : expectedLayersPerPool.at(poolIdx); auto const offsetBetweenBlocks = numLayersInPool * 2; tr::ITensor::SharedPtr blockOffsetsSlice = tr::ITensor::slice(tr::ITensor::at(kvCacheBlockOffsets, {poolIdx}), 0, maxBeamWidth); auto blockOffsetsShape = blockOffsetsSlice->getShape(); auto* const blockOffsetsPtr = tr::bufferCast(*blockOffsetsSlice); tk::KVCacheIndex::UnderlyingType runningSum{0}; for (auto block = 0; block < numSharedBlocks; ++block) { for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; } runningSum += offsetBetweenBlocks; } { auto const block = numSharedBlocks; for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; runningSum += offsetBetweenBlocks; } } { auto const block = numSharedBlocks + 1; auto const expected = std::numeric_limits::max(); for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), expected) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), expected) << "beam:" << beam << " block:" << block; } } } } EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numBlocksPerSeq); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); auto currentNumBlocks = totalNumBlocks; for (auto requestId = 0; requestId < maxNumSequences; ++requestId) { EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); currentNumBlocks -= numSharedBlocks + maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); currentNumBlocks -= maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); } EXPECT_EQ(blockManager.getNumFreeBlocks(), 0); } TEST_P(KVCacheManagerTest, KVCacheManagerRewindTokensTest) { using DType = half; auto constexpr numLayers = 2; auto constexpr numHeads = 2; auto constexpr sizePerHead = 64; auto constexpr hiddenSize = numHeads * sizePerHead; auto constexpr tokensPerBlock = 64; auto constexpr maxBlocksPerSeq = 10; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 1; auto constexpr sinkTokenLength = 0; auto const stream = std::make_shared(); auto constexpr requestId = 7; auto constexpr maxNumTokens = tokensPerBlock * 
maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1; auto constexpr numSharedBlocks = inputLength / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = false; auto constexpr onboardBlocks = true; auto const homogeneousLayers = GetParam(); KVCacheManager kvCacheManager = homogeneousLayers ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks) : KVCacheManager(std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); kvCacheManager.allocatePools(false); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numBlocksPerSeq); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numBlocksPerSeq); EXPECT_NO_THROW(kvCacheManager.rewindKVCache(requestId, 4)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); auto currentNumBlocks = totalNumBlocks; for (auto requestId = 0; requestId < maxNumSequences; ++requestId) { EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); currentNumBlocks -= numSharedBlocks + maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); currentNumBlocks -= maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); EXPECT_NO_THROW(kvCacheManager.rewindKVCache(requestId, 2)); currentNumBlocks += maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); } } TEST_P(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowTest) { using DType = half; using SizeType32 = KVCacheManager::SizeType32; auto constexpr numLayers = 4; auto constexpr numHeads = 2; // heterogeneous layers std::vector const numHeadsPerLayer({3, 2, 1, 2}); std::map const expectedHeadsPerPool({{0, 1}, {1, 2}, {2, 3}}); std::map const expectedLayersPerPool({{0, 1}, {1, 2}, {2, 1}}); auto constexpr sizePerHead = 64; auto constexpr hiddenSize = numHeads * sizePerHead; auto 
constexpr tokensPerBlock = 64; auto constexpr blockLengthPerSeq = 10; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 1; auto constexpr sinkTokenLength = 0; auto const stream = std::make_shared(); auto constexpr requestId = 7; auto constexpr maxNumTokens = tokensPerBlock * blockLengthPerSeq; auto constexpr inputLength = maxNumTokens - tokensPerBlock - 1; // Enable cyclic kv cache for all new generated tokens. auto constexpr maxAttentionWindow = inputLength; auto constexpr numSharedBlocks = std::min(inputLength, maxAttentionWindow) / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (blockLengthPerSeq - numSharedBlocks) * maxBeamWidth; auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock); auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = false; auto constexpr onboardBlocks = true; auto const homogeneousLayers = GetParam(); auto const expectedNumPools = homogeneousLayers ? 1 : static_cast(expectedHeadsPerPool.size()); KVCacheManager kvCacheManager = homogeneousLayers ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); kvCacheManager.allocatePools(false); EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth); tr::ITensor::SharedPtr const kvCacheBlockOffsets = tr::BufferManager::cpu( tr::ITensor::makeShape({expectedNumPools, maxNumSequences * maxBeamWidth, 2, maxBlocksPerSeq}), tr::TRTDataType::value); kvCacheManager.copyBlockOffsets(*kvCacheBlockOffsets, 0, requestId); EXPECT_EQ(blockManager.getNumPools(), expectedNumPools); if (homogeneousLayers) { EXPECT_EQ(blockManager.getBlockSize(0), numHeads * sizePerHead * tokensPerBlock); } else { for (auto layerIdx = 0; layerIdx < numLayers; layerIdx++) { EXPECT_EQ(blockManager.getBlockSize(blockManager.getLayerPoolIdx(layerIdx)), numHeadsPerLayer.at(layerIdx) * sizePerHead * tokensPerBlock); } } for (auto poolIdx = 0; poolIdx < blockManager.getNumPools(); poolIdx++) { auto const numLayersInPool = homogeneousLayers ? 
numLayers : expectedLayersPerPool.at(poolIdx); auto const offsetBetweenBlocks = numLayersInPool * 2; tr::ITensor::SharedPtr const blockOffsetsSlice = tr::ITensor::slice(tr::ITensor::at(kvCacheBlockOffsets, {poolIdx}), 0, maxBeamWidth); auto blockOffsetsShape = blockOffsetsSlice->getShape(); auto* const blockOffsetsPtr = tr::bufferCast(*blockOffsetsSlice); tk::KVCacheIndex::UnderlyingType runningSum{0}; for (auto block = 0; block < numSharedBlocks; ++block) { for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; } runningSum += offsetBetweenBlocks; } { auto const block = numSharedBlocks; for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; runningSum += offsetBetweenBlocks; } } } EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numBlocksPerSeq + 1); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numBlocksPerSeq + 1); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); auto currentNumBlocks = totalNumBlocks; for (auto requestId = 0; requestId < maxNumSequences; ++requestId) { EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); currentNumBlocks -= numSharedBlocks + maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); } EXPECT_EQ(blockManager.getNumFreeBlocks(), maxNumSequences); } TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest) { auto constexpr numLayers = 2; auto constexpr numHeads = 2; auto constexpr sizePerHead = 64; auto constexpr tokensPerBlock = 4; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 1; auto constexpr sinkTokenLength = 0; auto const stream = std::make_shared(); // Enable cyclic kv cache for long input tokens. 
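    // Note: with maxAttentionWindow == 16 and tokensPerBlock == 4, a sequence holds at most
    // maxBlocksPerSeq == 4 blocks. Once generation pushes a sequence past the window, its cache turns
    // cyclic and its blocks are no longer stored for reuse on release (see the "no blocks stored" checks
    // below).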
auto constexpr maxAttentionWindow = 16; auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock); auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = true; auto constexpr onboardBlocks = true; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); kvCacheManager.allocatePools(false); auto const& blockManager = kvCacheManager.getBlockManager(); SizeType32 constexpr maxNewTokens = 4; // prepare tokens with token[i] = 1000 + i TokenIdType constexpr firstToken = 1000; auto constexpr beamWidth = maxBeamWidth; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; SizeType32 requestId = 0; int inputLength = 16; auto inputTokens = std::make_shared(inputLength); std::iota(inputTokens->begin(), inputTokens->end(), firstToken); auto llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); auto constexpr beamIdx = 0; /////////////////////////////////////////////////////////////////////////// // add a long request and then remove it kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3})); // add tokens to enable cyclic kv cache llmRequest->addNewToken(1016, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1017, beamIdx); kvCacheManager.addToken(requestId); auto numTokens = llmRequest->getNumTokens(beamIdx); auto numBlocks = seq0.getCacheBlockIds(maxAttentionWindow)[beamIdx].size(); EXPECT_EQ(numBlocks, maxBlocksPerSeq); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // no blocks stored because cyclic KV cache was enabled EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add a short request and then remove it requestId = 1; inputLength = 7; inputTokens->resize(inputLength); llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5})); llmRequest->addNewToken(1007, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1008, beamIdx); kvCacheManager.addToken(requestId); numTokens = llmRequest->getNumTokens(beamIdx); numBlocks = seq1.getCacheBlockIds(maxAttentionWindow)[beamIdx].size(); EXPECT_EQ(numBlocks, 3); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // store blocks 4, 5 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007]) /////////////////////////////////////////////////////////////////////////// // add a medium request and then remove it // 
    // reuse first 2 blocks {4, 5} in previous request, and get new block 7
    requestId = 2;
    inputLength = 10;
    inputTokens->resize(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 2 * tokensPerBlock);
    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7}));
    numTokens = llmRequest->getNumTokens(beamIdx);
    numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
    EXPECT_EQ(numBlocks, 3);
    EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
    // store block 7 for reuse ([1008])

    ///////////////////////////////////////////////////////////////////////////
    // add a longer request within attention window and try to reuse
    // reuse blocks 4, 5, 7(p) and get new block 8
    // upon reaching the attention window, the block ids shouldn't change
    requestId = 3;
    inputLength = 15;
    inputTokens->resize(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq3 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 9);
    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7, 8}));
    llmRequest->addNewToken(1015, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1016, beamIdx);
    kvCacheManager.addToken(requestId);
    // FIXME: This means that reuse will break here - the window will start writing to a reused block, and the
    // following sequence that tries to reuse the block will read garbage. This will be fixed by removing the
    // cyclic kv cache.
    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 7, 8}));
    EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));

    ///////////////////////////////////////////////////////////////////////////
    // add a long request that exceeded attention window, no reuse
    requestId = 4;
    inputLength = 20;
    inputTokens->resize(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    llmRequest = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
    GenerationRequest const& seq4 = kvCacheManager.getSequence(requestId);
    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({9, 10, 11, 12}));
}

TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
{
    auto constexpr numLayers = 2;
    auto constexpr numHeads = 2;
    auto constexpr sizePerHead = 64;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxBeamWidth = 1;
    auto constexpr sinkTokenLength = 0;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared<tr::CudaStream>();
    // Enable cyclic kv cache for long input tokens.
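    // Sketch of the setup below: with two layers and two window sizes {16, 8}, blocks are tracked
    // separately per window, which is why the assertions later query both
    // seq.getCacheBlockIds(minAttentionWindow) and seq.getCacheBlockIds(maxAttentionWindow). With
    // tokensPerBlock = 4, that is at most ceilDiv(8, 4) = 2 blocks for the small window and
    // ceilDiv(16, 4) = 4 for the large one before each window turns cyclic.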
auto constexpr minAttentionWindow = 8; auto constexpr maxAttentionWindow = 16; auto const maxAttentionWindowVec = std::vector{maxAttentionWindow, minAttentionWindow}; auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxAttentionWindow, tokensPerBlock); auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = true; auto constexpr onboardBlocks = true; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, std::nullopt, dtype, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); kvCacheManager.allocatePools(false); auto const& blockManager = kvCacheManager.getBlockManager(); auto const allBlocksInPrimaryPools = blockManager.getNumPrimaryBlocks(); EXPECT_THAT(allBlocksInPrimaryPools, blocksInPrimaryPool); ASSERT_EQ(blockManager.isVariableWindow(), true); ASSERT_EQ(blockManager.isVariableGQA(), false); SizeType32 constexpr maxNewTokens = 4; // prepare tokens with token[i] = 1000 + i TokenIdType constexpr firstToken = 1000; auto constexpr beamWidth = maxBeamWidth; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; SizeType32 requestId = 0; int inputLength = 7; auto inputTokens = std::make_shared(inputLength); std::iota(inputTokens->begin(), inputTokens->end(), firstToken); auto llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); auto constexpr beamIdx = 0; /////////////////////////////////////////////////////////////////////////// // add a request that will exceed the min attention window *after context*, and then remove it. kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); auto const assertBlocks = [minAttentionWindow, maxAttentionWindow, beamIdx](GenerationRequest seq, std::initializer_list expectedBlocksMin, std::initializer_list expectedBlocksMax) { auto blocksMin = seq.getCacheBlockIds(minAttentionWindow).at(beamIdx); auto blocksMax = seq.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(blocksMin, ::testing::ElementsAreArray(expectedBlocksMin)); EXPECT_THAT(blocksMax, ::testing::ElementsAreArray(expectedBlocksMax)); return blocksMin.size() + blocksMax.size(); }; assertBlocks(seq0, {0, 1}, {0, 1}); // add tokens to enable cyclic kv cache for minimum but not maximum llmRequest->addNewToken(1016, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1017, beamIdx); kvCacheManager.addToken(requestId); auto const numBlocks = assertBlocks(seq0, {0, 1}, {0, 1, 2}); EXPECT_EQ(blockManager.getNumFreeBlocks(), allBlocksInPrimaryPools - numBlocks); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // no blocks stored because cyclic KV cache was enabled EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), allBlocksInPrimaryPools); /////////////////////////////////////////////////////////////////////////// // add a short request that is between the min and max attention window requestId = 1; inputLength = 9; inputTokens->resize(inputLength); llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId); 
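// For this 9-token prompt, the 16-token window needs roughly ceilDiv(9, 4) = 3 blocks, while the
// 8-token window is already at its 2-block cap, matching the two vs. three block ids expected below.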
EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); assertBlocks(seq1, {2, 3}, {3, 4, 5}); llmRequest->addNewToken(1007, beamIdx); kvCacheManager.addToken(requestId); llmRequest->addNewToken(1008, beamIdx); kvCacheManager.addToken(requestId); assertBlocks(seq1, {2, 3}, {3, 4, 5}); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); /////////////////////////////////////////////////////////////////////////// // add a request that won't reach the min attention window, so a block can be reused. requestId = 2; inputLength = 4; inputTokens->resize(inputLength); std::iota(inputTokens->begin(), inputTokens->end(), firstToken); llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); assertBlocks(seq2, {4}, {6}); auto const numTokens = llmRequest->getNumTokens(beamIdx); EXPECT_EQ(tc::ceilDiv(numTokens, tokensPerBlock), 1); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // store block 6 for reuse /////////////////////////////////////////////////////////////////////////// // add a request that won't reach the min attention window, so a block 6 from previous request will be reused. requestId = 3; inputLength = 4; inputTokens->resize(inputLength); std::iota(inputTokens->begin(), inputTokens->end(), firstToken); llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq3 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 3); assertBlocks(seq3, {4}, {6}); } namespace { KVCacheManager setupKvCacheManagerForHashTest(bool enableBlockReuse) { auto constexpr numLayers = 2; auto constexpr numHeads = 2; auto constexpr sizePerHead = 64; auto constexpr tokensPerBlock = 4; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 1; auto constexpr sinkTokenLength = 0; auto const stream = std::make_shared(); auto constexpr maxBlocksPerSeq = 8; auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; return KVCacheManager(std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, CacheType::kSELF, std::nullopt, nullptr, /*enableHashKey*/ true); } std::vector getHashAndRetrieveBlocksByHashTest( BlockManager const& blockManager, std::vector const& blockIds, SizeType32 windowSize) { std::vector blockHashes; for (auto blockId : blockIds) { blockHashes.emplace_back(blockManager.getBlockById(blockId, windowSize)->getHash()); } std::vector blockPtrs; for (auto hash : blockHashes) { auto range = blockManager.getBlocksByHash(hash, windowSize); BlockPtr const prevBlock = blockPtrs.empty() ? 
nullptr : blockPtrs.back(); BlockPtr thisBlock = nullptr; for (auto it = range.first; it != range.second; ++it) { if (it->second->getPrevBlockInSeq() == prevBlock) { thisBlock = it->second; break; } } EXPECT_NE(thisBlock, nullptr); blockPtrs.emplace_back(thisBlock); } EXPECT_EQ(blockHashes.size(), blockPtrs.size()); for (size_t i = 0; i < blockHashes.size(); i++) { EXPECT_EQ(blockManager.getBlockById(blockIds[i], windowSize), blockPtrs[i]); } return blockHashes; } } // namespace TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyTest) { auto kvCacheManager = setupKvCacheManagerForHashTest(false); auto const& blockManager = kvCacheManager.getBlockManager(); SizeType32 constexpr maxNewTokens = 4; // prepare tokens with token[i] = 1000 + i TokenIdType constexpr firstToken = 1000; auto constexpr beamWidth = 1; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; SizeType32 requestId = 0; int inputLength = 16; auto inputTokens = std::make_shared(inputLength); std::iota(inputTokens->begin(), inputTokens->end(), firstToken); auto llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); auto constexpr beamIdx = 0; /////////////////////////////////////////////////////////////////////////// // add a request and then remove it without reuse kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); auto& blockIds = seq.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds, ::testing::ElementsAreArray({0, 1, 2, 3})); // get blocks by hash and try to retrieve them by hash auto blockHashes = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // blocks are all removed for (auto hash : blockHashes) { auto range = blockManager.getBlocksByHash(hash, onlyWindowSize); EXPECT_EQ(range.first, range.second); } EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); } TEST_F(KVCacheManagerTest, KVCacheManagerHashKeyWithReuseTest) { auto kvCacheManager = setupKvCacheManagerForHashTest(true); auto const& blockManager = kvCacheManager.getBlockManager(); SizeType32 constexpr maxNewTokens = 4; // prepare tokens with token[i] = 1000 + i TokenIdType constexpr firstToken = 1000; auto constexpr beamWidth = 1; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; SizeType32 requestId = 0; int inputLength = 16; auto inputTokens = std::make_shared(inputLength); std::iota(inputTokens->begin(), inputTokens->end(), firstToken); auto llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); auto constexpr beamIdx = 0; /////////////////////////////////////////////////////////////////////////// // add a request and then remove it with reuse kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0); EXPECT_EQ(blockManager.getNumPools(), 1); auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); auto& blockIds0 = seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds0, ::testing::ElementsAreArray({0, 1, 2, 3})); // get blocks by hash and try to retrieve them by hash auto blockHashes = 
getHashAndRetrieveBlocksByHashTest(blockManager, blockIds0, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest)); // TODO: Make reused blocks accessible by hash, after sequence removed. Test here. /////////////////////////////////////////////////////////////////////////// // add a new request with same prefix requestId = 1; inputLength = 20; inputTokens->resize(inputLength); std::iota(inputTokens->begin(), inputTokens->end(), firstToken); llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest); GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId); EXPECT_EQ(llmRequest->getContextCurrentPosition(), 15); auto& blockIds1 = seq1.getCacheBlockIds(onlyWindowSize).at(beamIdx); EXPECT_THAT(blockIds1, ::testing::ElementsAreArray({0, 1, 2, 3, 4})); std::ignore = getHashAndRetrieveBlocksByHashTest(blockManager, blockIds1, onlyWindowSize); // blocks are reused, so reused blocks are still accessible by previous hashes for (size_t i = 0; i < 4; i++) { auto range = blockManager.getBlocksByHash(blockHashes[i], onlyWindowSize); EXPECT_NE(range.first, range.second); } // evicted block is not accessible { size_t i = 4; auto range = blockManager.getBlocksByHash(blockHashes[i], onlyWindowSize); EXPECT_EQ(range.first, range.second); } EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 5); } TEST_F(KVCacheManagerTest, KVCacheManagerEventStream) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 2; auto constexpr onboardBlocks = true; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1024)); kvCacheManager.allocatePools(false); auto events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 1); EXPECT_TRUE(std::holds_alternative(events.front().data)); EXPECT_THAT(std::get(events.front().data).numBlocksPerCacheLevel, ::testing::ElementsAreArray({8, 2})); EXPECT_EQ(getEvents(kvCacheManager).size(), 0); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true); llmRequest0->setLoraTaskId(42); kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0); kvCacheManager.storeContextBlocks(*llmRequest0); events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 1); EXPECT_TRUE(std::holds_alternative(events.front().data)); EXPECT_EQ(std::get(events.front().data).parentHash, std::nullopt); EXPECT_EQ(std::get(events.front().data).blocks.size(), 2); EXPECT_EQ(std::get(events.front().data).blocks[0].loraId, 42); EXPECT_EQ(std::get(events.front().data).blocks[0].cacheLevel, 0); kvCacheManager.addToken(0); llmRequest0->addNewToken(0, 0); kvCacheManager.removeSequence(0, 
llmRequest0); auto newEvents = getEvents(kvCacheManager); EXPECT_EQ(newEvents.size(), 1); EXPECT_EQ(newEvents.front().eventId, events.front().eventId + 1); EXPECT_TRUE(std::holds_alternative(newEvents.front().data)); // Check that it provides the link to the parent, and that it only stores the one block. auto storedEventData = std::get(newEvents.front().data); EXPECT_EQ( storedEventData.parentHash, std::get(events.front().data).blocks.back().blockHash); EXPECT_EQ(storedEventData.blocks.size(), 1); // Store with the same tokens auto inputTokens1 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8}); auto llmRequest1 = std::make_shared(1, 0, inputTokens1, samplingConfig, true); llmRequest1->setLoraTaskId(42); kvCacheManager.addSequence(1, inputTokens1->size(), beamWidth, llmRequest1); kvCacheManager.storeContextBlocks(*llmRequest1); kvCacheManager.removeSequence(1, llmRequest1); events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 0); auto inputTokens2 = std::make_shared(VecTokens{1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto llmRequest2 = std::make_shared(2, 0, inputTokens2, samplingConfig, true); kvCacheManager.addSequence(2, inputTokens2->size(), beamWidth, llmRequest2); auto inputTokens3 = std::make_shared(VecTokens{2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto llmRequest3 = std::make_shared(3, 0, inputTokens3, samplingConfig, true); kvCacheManager.addSequence(3, inputTokens3->size(), beamWidth, llmRequest3); events = getEvents(kvCacheManager); size_t firstSwapped = std::get(events.front().data).blockHash; // 3 blocks swapped to secondary EXPECT_EQ(events.size(), 4); for (int i = 2; i >= 0; i--) { EXPECT_TRUE(std::holds_alternative(events.front().data)); EXPECT_EQ(std::get(events.front().data).cacheLevel->oldValue, 0); EXPECT_EQ(std::get(events.front().data).cacheLevel->newValue, 1); events.pop_front(); } // The first block swapped to secondary gets evicted. EXPECT_TRUE(std::holds_alternative(events.front().data)); EXPECT_THAT(std::get(events.front().data).blockHashes, ::testing::ElementsAreArray({firstSwapped})); kvCacheManager.removeSequence(2, llmRequest2); kvCacheManager.removeSequence(3, llmRequest3); events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 2); for (int i = 0; i < 2; i++) { EXPECT_TRUE(std::holds_alternative(events.front().data)); EXPECT_EQ(std::get(events.front().data).parentHash, std::nullopt); EXPECT_EQ(std::get(events.front().data).blocks.size(), 3); events.pop_front(); } auto inputTokens4 = std::make_shared(VecTokens{0, 1, 2, 3, 1, 1, 1, 1, 0}); auto llmRequest4 = std::make_shared(4, 0, inputTokens4, samplingConfig, true); llmRequest4->setLoraTaskId(42); kvCacheManager.addSequence(4, inputTokens4->size(), beamWidth, llmRequest4); events = getEvents(kvCacheManager); // Offload 1 block, onboard 1 block, remove 1 secondary block. 
    // The second new block allocated was never stored, so it doesn't create a removed event.
    auto onboardedBlocks = 0;
    auto offloadedBlocks = 0;
    auto removedBlocks = 0;
    EXPECT_EQ(events.size(), 3);
    for (int i = 0; i < 3; i++)
    {
        if (std::holds_alternative<tle::KVCacheUpdatedData>(events.front().data))
        {
            if (std::get<tle::KVCacheUpdatedData>(events.front().data).cacheLevel->oldValue == 0)
                offloadedBlocks++;
            else
                onboardedBlocks++;
        }
        else if (std::holds_alternative<tle::KVCacheRemovedData>(events.front().data))
            removedBlocks++;
        else
            FAIL();
        events.pop_front();
    }
    EXPECT_EQ(onboardedBlocks, 1);
    EXPECT_EQ(offloadedBlocks, 1);
    EXPECT_EQ(removedBlocks, 1);
    kvCacheManager.storeContextBlocks(*llmRequest4);
    events = getEvents(kvCacheManager);
    EXPECT_TRUE(std::holds_alternative<tle::KVCacheStoredData>(events.front().data));
}

TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamOverflow)
{
    auto constexpr numLayers = 12;
    auto constexpr numHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr blocksInPrimaryPool = 8;
    auto constexpr blocksInSecondaryPool = 2;
    auto constexpr onboardBlocks = true;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr beamWidth = 1;
    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool,
        blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
        dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt,
        std::make_unique<KVCacheEventManager>(1));
    kvCacheManager.allocatePools(false);

    auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
    auto llmRequest0 = std::make_shared<LlmRequest>(0, 0, inputTokens0, samplingConfig, true);
    llmRequest0->setLoraTaskId(42);
    kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0);
    kvCacheManager.storeContextBlocks(*llmRequest0);
    auto events = getEvents(kvCacheManager);
    // The 'created' event should be evicted.
    EXPECT_EQ(events.size(), 1);
    EXPECT_TRUE(std::holds_alternative<tle::KVCacheStoredData>(events.front().data));
    EXPECT_EQ(std::get<tle::KVCacheStoredData>(events.front().data).parentHash, std::nullopt);
    kvCacheManager.addToken(0);
    llmRequest0->addNewToken(0, 0);
    events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 0);
    kvCacheManager.removeSequence(0, llmRequest0);
    events = getEvents(kvCacheManager);
    // The decode block is stored, and linked to the parent.
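    // (The event buffer above is capped at a single entry via KVCacheEventManager(1), so only the
    // newest event survives until the next flush; that is why the CREATED event was already dropped.)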
EXPECT_EQ(events.size(), 1); EXPECT_TRUE(std::holds_alternative(events.front().data)); EXPECT_NE(std::get(events.front().data).parentHash, std::nullopt); } TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamPriority) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 2; auto constexpr onboardBlocks = true; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1024)); kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true); llmRequest0->setKvCacheRetentionConfig(tle::KvCacheRetentionConfig( std::vector{tle::KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 50)}, 35)); kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0); kvCacheManager.storeContextBlocks(*llmRequest0); kvCacheManager.removeSequence(0, llmRequest0); auto events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 3); events.pop_front(); // The `CREATED` event for (int i = 0; i < 2; i++) { EXPECT_EQ(std::get(events.front().data).blocks.front().priority, 50); events.pop_front(); } auto inputTokens1 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto llmRequest1 = std::make_shared(1, 0, inputTokens1, samplingConfig, true); kvCacheManager.addSequence(1, inputTokens1->size(), beamWidth, llmRequest1); kvCacheManager.storeContextBlocks(*llmRequest1); kvCacheManager.removeSequence(1, llmRequest1); events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 1); // The second partial block gets stored. No priorities updated. 
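// (llmRequest1 set no retention config, so storing its block leaves priorities untouched; the next
// request lowers the context priority from 50 to 10, so the reused blocks should surface as
// priority-update events.)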
EXPECT_TRUE(std::holds_alternative(events.front().data)); auto inputTokens2 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto llmRequest2 = std::make_shared(2, 0, inputTokens2, samplingConfig, true); llmRequest2->setKvCacheRetentionConfig(tle::KvCacheRetentionConfig( std::vector{tle::KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 10)}, 35)); // Update the context block priorities kvCacheManager.addSequence(2, inputTokens2->size(), beamWidth, llmRequest2); events = getEvents(kvCacheManager); EXPECT_EQ(events.size(), 2); for (int i = 0; i < 2; i++) { auto diff = std::get(events.front().data).priority; EXPECT_EQ(diff->oldValue, 50); EXPECT_EQ(diff->newValue, 10); events.pop_front(); } } TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamBlocking) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 2; auto constexpr onboardBlocks = true; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManagerTest(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, dtype, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt); EXPECT_EQ(getEvents(kvCacheManagerTest).size(), 0); KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1024)); kvCacheManager.allocatePools(false); kvCacheManager.flushIterationEvents(); auto events = kvCacheManager.getLatestEvents(std::chrono::seconds(1)); EXPECT_EQ(events.size(), 1); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true); kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0); kvCacheManager.storeContextBlocks(*llmRequest0); kvCacheManager.flushIterationEvents(); events = kvCacheManager.getLatestEvents(std::chrono::seconds(1)); EXPECT_EQ(events.size(), 1); EXPECT_TRUE(std::holds_alternative(events.front().data)); } TEST_F(KVCacheManagerTest, KVCacheTransferManagerConcurrencyTest) { auto const blockSize = 16384; auto bufferManager = tensorrt_llm::runtime::BufferManager(std::make_shared()); auto transferManager = KVCacheTransferManager(bufferManager); auto pool = KVCacheBlockPool(0, 0, 0, 0, 1); pool.primaryPtr = bufferManager.gpu(tr::ITensor::makeShape({1, blockSize}), nvinfer1::DataType::kFLOAT); bufferManager.setZero(*pool.primaryPtr); pool.secondaryPtr = tr::BufferManager::pinned(tr::ITensor::makeShape({1, blockSize}), nvinfer1::DataType::kFLOAT); // Write some specific data into the cpu blocks. 
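// (The ones written below should not survive: after offload, an offset swap, onboard, a sync and a
// final offload of the zero-initialized GPU block, the pinned buffer is expected to read back 0, as
// the loop at the end of this test asserts.)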
for (int i = 0; i < blockSize; i++) { tr::bufferCast(*pool.secondaryPtr)[i] = 1; } auto primaryBlock = std::make_shared(0, tensorrt_llm::kernels::KVCacheIndex(0, false)); auto secondaryBlock = std::make_shared(1, tensorrt_llm::kernels::KVCacheIndex(0, true)); transferManager.offload(primaryBlock, secondaryBlock, {pool}); primaryBlock->swapMemoryPoolBlockOffset(secondaryBlock); transferManager.onboard(primaryBlock, secondaryBlock, {pool}); transferManager.syncTransfers(); transferManager.offload(primaryBlock, secondaryBlock, {pool}); bufferManager.getStream().synchronize(); for (int i = 0; i < blockSize; i++) { EXPECT_EQ(tr::bufferCast(*pool.secondaryPtr)[i], 0); } } TEST_P(KVCacheManagerTest, KVCacheManagerSinkTokenLengthTest) { using DType = half; using SizeType32 = KVCacheManager::SizeType32; auto constexpr numLayers = 4; auto constexpr numHeads = 2; // heterogeneous layers std::vector const numHeadsPerLayer({3, 2, 1, 2}); std::map const expectedHeadsPerPool({{0, 1}, {1, 2}, {2, 3}}); std::map const expectedLayersPerPool({{0, 1}, {1, 2}, {2, 1}}); auto constexpr sizePerHead = 64; auto constexpr hiddenSize = numHeads * sizePerHead; auto constexpr tokensPerBlock = 64; auto constexpr maxBlocksPerSeq = 10; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 4; auto constexpr sinkTokenLength = 4; auto const stream = std::make_shared(); auto constexpr requestId = static_cast(7); auto constexpr sinkTokensInLastBlock = sinkTokenLength % tokensPerBlock; auto constexpr bubbleLength = (sinkTokensInLastBlock) ? tokensPerBlock - sinkTokensInLastBlock : 0; auto constexpr inputLength = tokensPerBlock * maxBlocksPerSeq - bubbleLength - 1; auto constexpr maxAttentionWindow = inputLength - tokensPerBlock; auto constexpr numSharedBlocks = (sinkTokenLength + bubbleLength) / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq; auto constexpr numSharedBlocksCtx = (inputLength + bubbleLength) / tokensPerBlock; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = false; auto constexpr onboardBlocks = true; auto const homogeneousLayers = GetParam(); auto const expectedNumPools = homogeneousLayers ? 1 : static_cast(expectedHeadsPerPool.size()); auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks); kvCacheManager.allocatePools(false); EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocksCtx - maxBeamWidth); tr::ITensor::SharedPtr kvCacheBlockOffsets = tr::BufferManager::cpu( tr::ITensor::makeShape({expectedNumPools, maxNumSequences * maxBeamWidth, 2, maxBlocksPerSeq}), tr::TRTDataType::value); kvCacheManager.copyBlockOffsets(*kvCacheBlockOffsets, 0, requestId); EXPECT_EQ(blockManager.getNumPools(), expectedNumPools); if (homogeneousLayers) { EXPECT_EQ(blockManager.getBlockSize(0), numHeads * sizePerHead * tokensPerBlock); } else { for (auto layerIdx = 0; layerIdx < numLayers; layerIdx++) { EXPECT_EQ(blockManager.getBlockSize(blockManager.getLayerPoolIdx(layerIdx)), numHeadsPerLayer.at(layerIdx) * sizePerHead * tokensPerBlock); } } { for (auto poolIdx = 0; poolIdx < blockManager.getNumPools(); poolIdx++) { auto const numLayersInPool = homogeneousLayers ? 
numLayers : expectedLayersPerPool.at(poolIdx); auto const offsetBetweenBlocks = numLayersInPool * 2; tr::ITensor::SharedPtr blockOffsetsSlice = tr::ITensor::slice(tr::ITensor::at(kvCacheBlockOffsets, {poolIdx}), 0, maxBeamWidth); auto blockOffsetsShape = blockOffsetsSlice->getShape(); auto const blockOffsetsPtr = tr::bufferCast(*blockOffsetsSlice); tk::KVCacheIndex::UnderlyingType runningSum{0}; for (auto block = 0; block < numSharedBlocksCtx; ++block) { for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; } runningSum += offsetBetweenBlocks; } { auto const block = numSharedBlocksCtx; for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; runningSum += offsetBetweenBlocks; } } } } // replace the shared block with unshared blocks EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocksCtx - maxBeamWidth * 2 + 1); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocksCtx - maxBeamWidth * 2 + 1); EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); auto currentNumBlocks = totalNumBlocks; for (auto requestCounter = 0; requestCounter < maxNumSequences; ++requestCounter) { auto const nextRequestId = static_cast(requestId + requestCounter); EXPECT_NO_THROW(kvCacheManager.addSequence(nextRequestId, inputLength, maxBeamWidth)); currentNumBlocks -= numSharedBlocksCtx + maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); EXPECT_NO_THROW(kvCacheManager.addToken(nextRequestId)); currentNumBlocks -= maxBeamWidth - 1; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); } auto numUsedBlocks = maxNumSequences * (numSharedBlocksCtx + maxBeamWidth * 2 - 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numUsedBlocks); } TEST_P(KVCacheManagerTest, KVCacheManagerBatchTest) { using DType = half; using SizeType32 = KVCacheManager::SizeType32; auto constexpr numLayers = 4; auto constexpr numHeads = 2; // heterogeneous layers std::vector const numHeadsPerLayer({3, 2, 1, 2}); std::map const expectedHeadsPerPool({{0, 1}, {1, 2}, {2, 3}}); std::map const expectedLayersPerPool({{0, 1}, {1, 2}, {2, 1}}); auto constexpr sizePerHead = 64; auto constexpr hiddenSize = numHeads * sizePerHead; auto constexpr tokensPerBlock = 64; auto constexpr maxBlocksPerSeq = 32; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 4; auto constexpr sinkTokenLength = 0; auto const stream = std::make_shared(); auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxNumTokens; auto constexpr inputLength = maxNumTokens 
- 2; auto constexpr numBlocksPerSeq = maxBlocksPerSeq - 1 + maxBeamWidth; auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = false; auto constexpr onboardBlocks = true; auto const homogeneousLayers = GetParam(); auto const expectedNumPools = homogeneousLayers ? 1 : static_cast(expectedHeadsPerPool.size()); KVCacheManager kvCacheManager = homogeneousLayers ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks); kvCacheManager.allocatePools(false); EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); for (auto requestId = 0; requestId < maxNumSequences; ++requestId) { EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); auto const currentNumBlocks = totalNumBlocks - (requestId + 1) * numBlocksPerSeq; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); } tr::ITensor::SharedPtr const kvCacheBlockOffsets = tr::BufferManager::cpu( tr::ITensor::makeShape({expectedNumPools, maxNumSequences * maxBeamWidth, 2, maxBlocksPerSeq}), tr::TRTDataType::value); kvCacheManager.getBlockOffsetsOfBatch(*kvCacheBlockOffsets, 0, maxNumSequences, maxBeamWidth); EXPECT_EQ(blockManager.getNumPools(), expectedNumPools); if (homogeneousLayers) { EXPECT_EQ(blockManager.getBlockSize(0), numHeads * sizePerHead * tokensPerBlock); } else { for (auto layerIdx = 0; layerIdx < numLayers; layerIdx++) { EXPECT_EQ(blockManager.getBlockSize(blockManager.getLayerPoolIdx(layerIdx)), numHeadsPerLayer.at(layerIdx) * sizePerHead * tokensPerBlock); } } { for (auto poolIdx = 0; poolIdx < blockManager.getNumPools(); poolIdx++) { auto const numLayersInPool = homogeneousLayers ? 
numLayers : expectedLayersPerPool.at(poolIdx); auto const offsetBetweenBlocks = numLayersInPool * 2; tr::ITensor::SharedPtr const blockOffsetsSlice = tr::ITensor::slice( tr::ITensor::at(kvCacheBlockOffsets, {poolIdx}), 0, maxNumSequences * maxBeamWidth); auto blockOffsetsShape = blockOffsetsSlice->getShape(); auto* const blockOffsetsPtr = tr::bufferCast(*blockOffsetsSlice); tk::KVCacheIndex::UnderlyingType runningSum{0}; for (auto requestId = 0; requestId < maxNumSequences; ++requestId) { for (auto block = 0; block < maxBlocksPerSeq - 1; ++block) { for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, requestId * maxBeamWidth + beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, requestId * maxBeamWidth + beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; } runningSum += offsetBetweenBlocks; } auto const block = maxBlocksPerSeq - 1; { for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, requestId * maxBeamWidth + beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, requestId * maxBeamWidth + beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; runningSum += offsetBetweenBlocks; } } } } } } namespace { void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draftLen, bool homogeneousLayers) { using DType = half; using SizeType32 = KVCacheManager::SizeType32; auto constexpr numLayers = 4; auto constexpr numHeads = 2; // heterogeneous layers std::vector const numHeadsPerLayer({3, 2, 1, 2}); auto constexpr sizePerHead = 64; auto constexpr hiddenSize = numHeads * sizePerHead; auto constexpr tokensPerBlock = 8; auto constexpr maxBlocksPerSeq = 10; auto constexpr maxNumSequences = 8; auto constexpr sinkTokenLength = 0; auto const stream = std::make_shared(); auto constexpr totalNumBlocks = maxNumSequences * maxBlocksPerSeq; TLLM_CHECK(draftLen == 0 || beamWidth == 1); // Deal with one sequence for now auto constexpr requestId = static_cast(7); SizeType32 maxNewTokens = 20; bool isStreaming = false; SizeType32 const maxInputLength{65}; SizeType32 const maxMaxBeamWidth{beamWidth}; for (int maxBeamWidth = 1; maxBeamWidth <= maxMaxBeamWidth; ++maxBeamWidth) { tr::SamplingConfig const samplingConfig{maxBeamWidth}; for (int inputLength = 1; inputLength < maxInputLength; ++inputLength) { auto constexpr maxNumTokens = tokensPerBlock * maxBlocksPerSeq; // auto constexpr maxAttentionWindow = maxNumTokens / 2; auto constexpr maxAttentionWindow = 46; auto constexpr totalNumBlocks = maxNumSequences * maxBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, kv_cache_block_reuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, totalNumBlocks, blocksInSecondaryPool, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, kv_cache_block_reuse, onboardBlocks); kvCacheManager.allocatePools(false); EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, tc::ceilDiv(maxAttentionWindow, tokensPerBlock)); auto inputTokens = std::make_shared(VecTokens(inputLength, 0)); auto draftTokens = std::make_shared>(draftLen); auto llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); llmRequest->setDraftTokens(draftTokens); auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager); auto remainingBlocksToCompletion = kvCacheManager.getRemainingBlocksToCompletion(*llmRequest, onlyWindowSize); auto neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false, onlyWindowSize); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth, llmRequest)); for (int di = 0; di < draftLen && di < maxNewTokens && (inputLength + di) < maxAttentionWindow; ++di) { for (int beam = 0; beam < maxBeamWidth; beam++) { llmRequest->addNewToken(1, beam); } EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); } auto numUsedBlocksThisStep = kvCacheManager.getUsedNumBlocks(); EXPECT_EQ(numUsedBlocksThisStep, neededBlocksOneStep); // Simulate adding new tokens during generation llmRequest->setState(LlmRequestState::kGENERATION_IN_PROGRESS); for (int i = draftLen; i < maxNewTokens && (inputLength + i) < maxAttentionWindow; i += (draftLen + 1)) { auto numCurrentlyUsedBlocks = kvCacheManager.getUsedNumBlocks(); for (int beam = 0; beam < maxBeamWidth; beam++) { llmRequest->addNewToken(1, beam); } neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false, onlyWindowSize); for (int beam = 0; beam < maxBeamWidth; beam++) { for (int di = 0; di < draftLen && (i + di) < maxNewTokens && (inputLength + i + di) < maxAttentionWindow; ++di) { llmRequest->addNewToken(1, beam); } } for (int di = 0; di < draftLen + 1 && (i + di) < maxNewTokens && (inputLength + i + di) < maxAttentionWindow; ++di) { EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); } numUsedBlocksThisStep = kvCacheManager.getUsedNumBlocks() - numCurrentlyUsedBlocks; EXPECT_EQ(numUsedBlocksThisStep, neededBlocksOneStep); } // After adding all tokens, we should match remainingBlocksToCompletion EXPECT_EQ(remainingBlocksToCompletion, kvCacheManager.getUsedNumBlocks()); EXPECT_EQ(kvCacheManager.getRemainingBlocksToCompletion(*llmRequest, onlyWindowSize), 0); } } } } // namespace TEST_P(KVCacheManagerTest, neededBlocksOneStepKvCacheBlockReuse) { testNeededBlocksOneStep(true, 1, 0, GetParam()); // maxBeamWidth is 1 when kv cache reuse is enabled } TEST_P(KVCacheManagerTest, neededBlocksOneStep) { testNeededBlocksOneStep(false, 4, 0, GetParam()); } TEST_P(KVCacheManagerTest, neededBlocksOneStepKvCacheBlockReuseDraftTokens) { testNeededBlocksOneStep(true, 1, 5, GetParam()); } INSTANTIATE_TEST_SUITE_P(KVCacheManagerTest, KVCacheManagerTest, testing::Values(true, false), // homogeneousLayers generateTestName); // calculateMaxBlockRequirementsPerBeam using 
using BlockRequirementsParamType = std::tuple<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32>;

class BlockRequirementsParamTest : public ::testing::TestWithParam<BlockRequirementsParamType>
{
protected:
    void SetUp() override {}

    void TearDown() override {}
};

TEST_P(BlockRequirementsParamTest, TestCalculateMaxBlocksRequirement)
{
    BlockRequirementsParamType param = GetParam();
    auto const result = KVCacheManager::calculateMaxBlockRequirementsPerBeam(
        std::get<0>(param), std::get<1>(param), std::get<2>(param), std::get<3>(param));
    ASSERT_EQ(result, std::get<4>(param));
}

INSTANTIATE_TEST_SUITE_P(CalculateMaxBlockRequirementsPerBeam, BlockRequirementsParamTest,
    testing::Values(std::make_tuple(512, 0, 1024, 64, 8), std::make_tuple(513, 0, 1024, 64, 9),
        std::make_tuple(512, 0, 256, 64, 4), std::make_tuple(512, 0, 257, 64, 5),
        std::make_tuple(512, 64, 1024, 64, 8), std::make_tuple(513, 64, 1024, 64, 9),
        std::make_tuple(512, 64, 256, 64, 4), std::make_tuple(512, 64, 257, 64, 5),
        std::make_tuple(512, 65, 1024, 64, 9), std::make_tuple(513, 65, 1024, 64, 9),
        std::make_tuple(512, 65, 256, 64, 5), std::make_tuple(512, 65, 257, 64, 5)));

// calculateMaxBlockRequirements
TEST(CalculateMaxBlockRequirements, BeamWidthOneEqualRequirementsPerBeam)
{
    auto const perBeamResult = KVCacheManager::calculateMaxBlockRequirementsPerBeam(512, 0, 2048, 64);
    auto const result = KVCacheManager::calculateMaxBlockRequirements(257, 255, 0, 2048, 1, 64);
    ASSERT_EQ(result, perBeamResult);
}

TEST(CalculateMaxBlockRequirements, AttentionWindowFitsOutputEqualSingleBeamTimesBeamWidth)
{
    auto constexpr beamWidth = 4;
    auto constexpr attentionWindow = 128;
    auto constexpr outputLength = 255;
    auto const perBeamResult = KVCacheManager::calculateMaxBlockRequirementsPerBeam(512, 0, attentionWindow, 64);
    auto const result = KVCacheManager::calculateMaxBlockRequirements(257, 255, 0, attentionWindow, beamWidth, 64);
    ASSERT_EQ(result, perBeamResult * beamWidth);
}

TEST(CalculateMaxBlockRequirements, AttentionWindowOverlapsInputAndOutputReferenceResult)
{
    auto constexpr beamWidth = 4;
    auto constexpr attentionWindow = 412;
    auto constexpr outputLength = 255;
    auto const perBeamResult = KVCacheManager::calculateMaxBlockRequirementsPerBeam(512, 0, attentionWindow, 64);
    auto const result = KVCacheManager::calculateMaxBlockRequirements(257, 255, 0, attentionWindow, beamWidth, 64);
    auto const numContextBlocks = 2; // (412 - 255) / 64
    // There are 29 context tokens left over to be put in output blocks, so 284 tokens to fit in output blocks in
    // total: 5 blocks
    auto const numOutputBlocks = 5 * beamWidth;
    ASSERT_EQ(result, numContextBlocks + numOutputBlocks);
}

// calculateMaxAttentionWindow
TEST(CalculateMaxAttentionWindow, OutputTooLargeForCapacity)
{
    auto constexpr beamWidth = 4;
    auto constexpr blockCapacity = 12;
    auto constexpr outputLength = 255;
    auto const result
        = KVCacheManager::calculateMaxAttentionWindow(1024, outputLength, 0, blockCapacity, beamWidth, 64);
    ASSERT_EQ(result, 192);
}

TEST(CalculateMaxAttentionWindow, CapacityIsEnoughForWholeSequence)
{
    auto constexpr beamWidth = 4;
    auto constexpr blockCapacity = 256;
    auto constexpr outputLength = 1024;
    auto constexpr inputLength = 1024;
    auto const result
        = KVCacheManager::calculateMaxAttentionWindow(inputLength, outputLength, 0, blockCapacity, beamWidth, 64);
    ASSERT_EQ(result, inputLength + outputLength);
}

namespace
{
struct KvCacheManagerInstantiationParameters
{
    SizeType32 numLayers;
    std::variant<SizeType32, std::vector<SizeType32>> numHeadsPerLayer;
    SizeType32 sizePerHead;
    SizeType32 tokensPerBlock;
    SizeType32 numBlocksInPrimaryPool;
    SizeType32 sinkTokenLength;
    SizeType32 maxAttentionWindow;
    SizeType32
maxBeamWidth; SizeType32 maxNumTokens; bool kvCacheBlockReuse; std::vector maxAttentionWindowVec = {maxAttentionWindow}; nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT; }; struct GetRemainingBlocksToCompletionOneRequestParameters { KvCacheManagerInstantiationParameters kvCacheManagerInstantiationParameters; SizeType32 promptLength; SizeType32 maxOutputLength; SizeType32 expectedRemainingBlocksToCompletion; }; struct FillKvCacheAndCompleteRequestsParameters { KvCacheManagerInstantiationParameters kvCacheManagerInstantiationParameters; SizeType32 promptLength; SizeType32 maxOutputLength; }; std::shared_ptr createKvCacheManager( KvCacheManagerInstantiationParameters const& kvCacheInstantiationParameters, StreamPtr stream) { auto const maxInputLength = kvCacheInstantiationParameters.maxNumTokens - 1; auto const temporaryKvCacheInputs = TempAttentionWindowInputs{true, maxInputLength, kvCacheInstantiationParameters.maxNumTokens}; if (std::holds_alternative(kvCacheInstantiationParameters.numHeadsPerLayer)) { auto const numHeadsPerLayer = std::get(kvCacheInstantiationParameters.numHeadsPerLayer); auto numHeadsPerLayerVec = std::vector{kvCacheInstantiationParameters.numLayers}; std::fill(numHeadsPerLayerVec.begin(), numHeadsPerLayerVec.end(), numHeadsPerLayer); return std::make_shared(numHeadsPerLayerVec, kvCacheInstantiationParameters.sizePerHead, kvCacheInstantiationParameters.tokensPerBlock, kvCacheInstantiationParameters.numBlocksInPrimaryPool, 0, kvCacheInstantiationParameters.numBlocksInPrimaryPool, kvCacheInstantiationParameters.maxBeamWidth, std::vector{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheInputs, kvCacheInstantiationParameters.dtype, kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, kvCacheInstantiationParameters.kvCacheBlockReuse, true, CacheType::kSELF); } if (std::holds_alternative>(kvCacheInstantiationParameters.numHeadsPerLayer)) { auto const numHeadsPerLayer = std::get>(kvCacheInstantiationParameters.numHeadsPerLayer); return std::make_shared(numHeadsPerLayer, kvCacheInstantiationParameters.sizePerHead, kvCacheInstantiationParameters.tokensPerBlock, kvCacheInstantiationParameters.numBlocksInPrimaryPool, 0, kvCacheInstantiationParameters.numBlocksInPrimaryPool, kvCacheInstantiationParameters.maxBeamWidth, std::vector{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheInputs, kvCacheInstantiationParameters.dtype, kvCacheInstantiationParameters.sinkTokenLength, stream, std::nullopt, kvCacheInstantiationParameters.kvCacheBlockReuse, true, CacheType::kSELF); } TLLM_THROW("Unhandled type of num heads per layer provided."); } std::vector fillKvCacheManager(KVCacheManager& kvCacheManager, SizeType32 promptLength, SizeType32 maxOutputLength, SizeType32 maxBeamWidth, RequestIdType requestIdStart) { auto inputTokens = std::make_shared>(static_cast(promptLength)); auto const llmRequest = LlmRequest{ 0, maxOutputLength, inputTokens, tensorrt_llm::runtime::SamplingConfig{maxBeamWidth}, true, }; // Adding enough requests to fill the kv-cache. 
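// (Strategy: getRemainingBlocksToCompletion is evaluated once for a representative request, and new
// requests are emplaced while at least that many free blocks remain, so the cache is filled without
// over-committing.)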
auto remainingFreeBlocks = kvCacheManager.getNumFreeBlocks(); auto llmRequests = std::vector{}; auto const remainingBlocksToCompletionFromStart = kvCacheManager.getRemainingBlocksToCompletion(llmRequest, theOnlyWindowSize(kvCacheManager)); do { ++requestIdStart; std::fill(inputTokens->begin(), inputTokens->end(), requestIdStart); llmRequests.emplace_back( requestIdStart, maxOutputLength, inputTokens, tensorrt_llm::runtime::SamplingConfig{maxBeamWidth}, true); remainingFreeBlocks -= remainingBlocksToCompletionFromStart; } while (remainingFreeBlocks >= remainingBlocksToCompletionFromStart); for (auto request : llmRequests) { kvCacheManager.addSequence(request.mRequestId, request.getPromptLen(), maxBeamWidth, request); request.mState = LlmRequestState::kGENERATION_IN_PROGRESS; kvCacheManager.storeContextBlocks(request); } kvCacheManager.refreshBlocks(); return llmRequests; } } // namespace class RemainingBlocksToCompletionTest : public ::testing::TestWithParam { protected: void SetUp() override { auto const stream = std::make_shared(); auto const params = GetParam(); kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream); kvCacheManager->allocatePools(false); } void TearDown() override {} std::shared_ptr kvCacheManager; }; TEST_P(RemainingBlocksToCompletionTest, RemainingBlocksToCompletionCorrectlyEstimated) { auto const params = GetParam(); auto const inputTokens = std::make_shared>(static_cast(params.promptLength)); auto const llmRequest = LlmRequest{ 0, params.maxOutputLength, inputTokens, tensorrt_llm::runtime::SamplingConfig{params.kvCacheManagerInstantiationParameters.maxBeamWidth}, true, }; auto const result = kvCacheManager->getRemainingBlocksToCompletion(llmRequest, theOnlyWindowSize(*kvCacheManager)); ASSERT_EQ(result, params.expectedRemainingBlocksToCompletion); } INSTANTIATE_TEST_SUITE_P(RemainingBlocksToCompletionCorrectlyEstimated, RemainingBlocksToCompletionTest, ::testing::Values( GetRemainingBlocksToCompletionOneRequestParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 1, 4096, 0, 4096, 1, 4096 * 4, false, }, 1024, 128, 1024 + 128, }, GetRemainingBlocksToCompletionOneRequestParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096, 0, 4096, 1, 4096 * 4, false, }, 1024, 128, 18, }, GetRemainingBlocksToCompletionOneRequestParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096, 0, 128, 1, 4096 * 4, false, }, 1024, 128, 18, // See `temporaryAttentionWindow` concept. })); class FillKvCacheAndCompleteRequestsTest : public ::testing::TestWithParam { protected: void SetUp() override { auto const stream = std::make_shared(); auto const params = GetParam(); kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream); kvCacheManager->allocatePools(false); } void TearDown() override {} std::shared_ptr kvCacheManager; }; TEST_P(FillKvCacheAndCompleteRequestsTest, FillKvCacheWithRequestsAndCompleteOneByOne) { auto const params = GetParam(); auto llmRequests = fillKvCacheManager(*kvCacheManager, params.promptLength, params.maxOutputLength, params.kvCacheManagerInstantiationParameters.maxBeamWidth, 0); // Completing half. 
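// In this variant each request is driven to its full maxOutputLength and then removed; once every
// sequence has been released, the pool should be back to numBlocksInPrimaryPool free blocks, as the
// final assertion checks.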
for (auto& llmRequest : llmRequests) { for (SizeType32 i = 0; i < params.maxOutputLength; i++) { llmRequest.addNewToken(0, 0); kvCacheManager->addToken(llmRequest.mRequestId); } kvCacheManager->removeSequence(llmRequest.mRequestId, llmRequest); } ASSERT_EQ(kvCacheManager->getNumFreeBlocks(), params.kvCacheManagerInstantiationParameters.numBlocksInPrimaryPool); } TEST_P(FillKvCacheAndCompleteRequestsTest, FillKvCacheAndCompleteInParallel) { auto const params = GetParam(); auto llmRequests = fillKvCacheManager(*kvCacheManager, params.promptLength, params.maxOutputLength, params.kvCacheManagerInstantiationParameters.maxBeamWidth, 0); for (SizeType32 i = 0; i < params.maxOutputLength; i++) { for (auto const& llmRequest : llmRequests) { kvCacheManager->addToken(llmRequest.mRequestId); } } } auto const paramValues = ::testing::Values( FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 1, 4096, 0, 4096, 1, 4096 * 4, false, }, 128, 128, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096, 0, 4096, 1, 4096 * 4, false, }, 256, 128, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096, 0, 4096, 1, 4096 * 4, false, }, 256, 128, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096, 0, 4096, 1, 4096 * 4, false, }, 500, 100, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096, 0, 128, 1, 4096 * 4, false, }, 500, 100, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 128, 0, 4096, 1, 4096 * 4, false, }, 5250, 500, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096, 0, 2048, 1, 4096 * 4, false, }, 5000, 500, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 5000, 500, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 500, 5000, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 2048, 2048, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 2049, 2048, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 2047, 2048, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 1024, 1024, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 1025, 1023, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 1023, 1025, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 2047, 32, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 2049, 32, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true, }, 2048 - 60, 32, }); 
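// (Reading one tuple above against KvCacheManagerInstantiationParameters: for example
// {1, 1, 1, 64, 4096 * 16, 0, 2048, 1, 4096 * 4, true} means 1 layer, 1 head, head size 1,
// 64 tokens per block, 4096 * 16 primary blocks, no sink tokens, a 2048-token attention window,
// beam width 1, 4096 * 4 max tokens, and block reuse enabled.)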
INSTANTIATE_TEST_SUITE_P(FillKvCacheAndCompleteRequestsTest, FillKvCacheAndCompleteRequestsTest, paramValues);