/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/batch_manager/kvCacheManager.h"

#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/batch_manager/kvCacheEventManager.h"
#include "tensorrt_llm/batch_manager/kvCacheTransferManager.h"
#include "tensorrt_llm/batch_manager/kvCacheUtils.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/executor/transferAgent.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/kernels/kvCacheIndex.h"
#include "tensorrt_llm/kernels/kvCacheUtils.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/samplingConfig.h"

#include "gtest/gtest.h"

// NOTE(review): the original system-header targets were lost when this file was mangled (all
// angle-bracketed text was stripped, leaving bare `#include` lines). The list below is
// reconstructed from what the tests actually use (gmock matchers, <filesystem>, POSIX
// open/write/close, std::accumulate, std::round, threads/chrono) — confirm against upstream.
#include <gmock/gmock.h>

#include <fcntl.h>
#include <unistd.h>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <deque>
#include <filesystem>
#include <map>
#include <memory>
#include <numeric>
#include <set>
#include <string>
#include <thread>
#include <tuple>
#include <vector>

using namespace tensorrt_llm::batch_manager;
using namespace tensorrt_llm::batch_manager::kv_cache_manager;
using namespace tensorrt_llm::executor;

namespace tc = tensorrt_llm::common;
namespace tk = tensorrt_llm::kernels;
namespace tlk = tensorrt_llm::batch_manager::kv_cache_manager;
namespace tle = tensorrt_llm::executor;
namespace tr = tensorrt_llm::runtime;
namespace fs = std::filesystem;
using BlocksPerWindow = std::map>; using ParamType = bool; namespace { std::string generateTestName(testing::TestParamInfo const& info) { auto const homogeneousLayers = info.param; std::string name = "KVCacheManagerTest"; if (homogeneousLayers) { name += "Homogeneous"; } else { name += "Heterogeneous"; } return name; } SizeType32 theOnlyWindowSize(KVCacheManager const& kvCacheManager) { auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(blockManager.getWindowSizesMetadata().size(), 1) << "Assuming a single window size"; return blockManager.getPoolWindowSize(0); } } // namespace class KVCacheManagerTest : public ::testing::Test, public ::testing::WithParamInterface // NOLINT(cppcoreguidelines-pro-type-member-init) { protected: void SetUp() override { auto const deviceCount = tc::getDeviceCount(); if (deviceCount == 0) { GTEST_SKIP(); } } void TearDown() override {} }; namespace { // TODO: This is really ugly. Flushing the event queue is done in a separate thread, so if we want to check the value we // need to wait for the thread to complete. It works, but it's technically not deterministic. 
std::deque getEvents(KVCacheManager& kvCacheManager) { kvCacheManager.flushIterationEvents(); std::this_thread::sleep_for(std::chrono::milliseconds(100)); return kvCacheManager.getLatestEvents(std::chrono::milliseconds(100)); } } // namespace TEST_F(KVCacheManagerTest, BlockManagerTest) { auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 128; auto constexpr tokensPerBlock = 64; auto constexpr blocksInPrimaryPool = 24; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr beamWidth = 8; auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth; auto constexpr numTokens = tokensPerBlock * numBlocksPerBeam; auto constexpr maxAttentionWindow = numTokens; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); auto constexpr requestId = 42; GenerationRequest seq0{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false); auto constexpr occupiedBlocks = (numBlocksPerBeam - 1) + beamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - occupiedBlocks); auto const& ids = seq0.getCacheBlockIds(maxAttentionWindow); std::set idSet{}; EXPECT_EQ(ids.size(), beamWidth); for (auto const& beam : ids) { 
EXPECT_EQ(beam.size(), blocksInPrimaryPool / beamWidth); idSet.insert(beam.begin(), beam.end()); } EXPECT_EQ(idSet.size(), occupiedBlocks); blockManager.releaseBlocks(seq0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/true); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocksPerBeam); EXPECT_EQ(ids.size(), beamWidth); for (std::size_t i = 0u; i < ids.front().size(); ++i) { for (std::size_t beam = 1u; beam < ids.size(); ++beam) { EXPECT_EQ(ids.at(beam).at(i), ids.at(0).at(i)); } } blockManager.releaseBlocks(seq0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); // occupy 22/24 blocks EXPECT_NO_THROW( blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false)); GenerationRequest seq1{requestId + 1, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; blockManager.holdSequence(seq1.getRequestId()); EXPECT_NO_THROW( blockManager.addSequence(seq1, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false)); // same requestId not allowed GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; blockManager.holdSequence(seq2.getRequestId()); EXPECT_THROW( blockManager.addSequence(seq2, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false), std::runtime_error); // no more blocks GenerationRequest seq3{requestId + 2, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; blockManager.holdSequence(seq3.getRequestId()); EXPECT_THROW( blockManager.addSequence(seq3, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false), std::runtime_error); } template void writePatternToOffloadedBlocksDRAM(T* rawBlockPtr, int blockSize, int mask) { for (int i = 0; i < blockSize; ++i) { rawBlockPtr[i] = i & mask; } } template void writePatternToOffloadedBlocksGDS( std::string const& 
directory, int blockId, SizeType32 numPools, int blockSize, int mask) { for (size_t poolIdx = 0; poolIdx < numPools; ++poolIdx) { std::string filename = directory + "/block_" + std::to_string(blockId) + "_pool_" + std::to_string(poolIdx) + ".bin"; int fd = ::open(filename.c_str(), O_WRONLY); if (fd >= 0) { auto poolBlockSize = blockSize / numPools; std::vector buffer(poolBlockSize); for (int i = 0; i < poolBlockSize; ++i) { buffer[i] = i & mask; } auto const bytesToWrite = static_cast(poolBlockSize) * sizeof(T); auto const written = ::write(fd, buffer.data(), bytesToWrite); EXPECT_EQ(written, static_cast(bytesToWrite)) << "Failed to write pattern to offloaded block file " << filename; ::close(fd); } } } template void runPartialCopyTest() { auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 128; auto constexpr tokensPerBlock = 8; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 4; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr batchSize = 1; auto constexpr maxBlocksPerSeq = 10; auto constexpr bytesPerToken = 4; auto constexpr maxAttentionWindow = 4096; auto constexpr maxAttentionWindowAllLayer = 4096; auto constexpr sinkTokenLen = 0; auto constexpr canUseOneMoreBlock = true; std::string directory = ""; static int file_num = 0; if constexpr (transferMode == KvCacheTransferMode::GDS) { std::string filename = std::string("test_copy") + std::to_string(file_num++); auto dirPath = fs::absolute(filename); fs::create_directories(dirPath); directory = dirPath.string(); } SizeType32 constexpr maxNewTokens{0}; auto constexpr beamWidth = 1; auto constexpr beamIdx = 0; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; BlockManager blockManager(std::vector(numLayers, numKvHeads), 
sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, type, 0, onboardBlocks); blockManager.allocatePools(false); auto oneLayerBlockSize = blockManager.getBlockSize(0); EXPECT_EQ(oneLayerBlockSize, numKvHeads * sizePerHead * tokensPerBlock); auto primaryPoolPtr = blockManager.getPrimaryPool(0); auto secondaryPoolPtr = blockManager.getSecondaryPool(0); tk::KVBlockArray kvCacheBlockArray(batchSize, maxBlocksPerSeq, tokensPerBlock, bytesPerToken, maxAttentionWindow, maxAttentionWindowAllLayer, sinkTokenLen, canUseOneMoreBlock, primaryPoolPtr->data(), secondaryPoolPtr->data(), nullptr); // Verify that shape of block for one layer of K or V is [numKvHeads, tokensPerBlock, sizePerHead] by comparing // against KVBlockArray::getKVLocalIdx method. We make this assumption in partialCopy kernel. auto constexpr localTokenIdx = 3; auto constexpr headIdx = 5; auto constexpr channelIdx = 7; auto localKIdx = kvCacheBlockArray.getKVLocalIdx(localTokenIdx, headIdx, sizePerHead, channelIdx); EXPECT_EQ(localKIdx, (headIdx * tokensPerBlock + localTokenIdx) * sizePerHead + channelIdx); // Pool block has shape [2, numLayers, numKvHeads, tokensPerBlock, sizePerHead] auto blockSize = 2 * numLayers * oneLayerBlockSize; auto primaryPoolSize = blocksInPrimaryPool * blockSize; auto secondaryPoolSize = blocksInSecondaryPool * blockSize; // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] (17 tokens, three blocks) auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = 
tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); auto cacheBlockIds = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds, ::testing::ElementsAreArray({0, 1, 2})); // Offload all 3 blocks, fill with predictable pattern, onboard for (auto cacheBlockId : cacheBlockIds) { auto block = blockManager.getBlockById(cacheBlockId, maxAttentionWindow); EXPECT_TRUE(block->isPrimary()); // offload so we can write to block in CPU code blockManager.offloadBlock(block, maxAttentionWindow, transferMode, directory); EXPECT_FALSE(block->isPrimary()); // need to sync so D2H transfer is done before accessing blocks EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); // fill with predictable pattern auto memoryPoolIndex = block->getMemoryPoolBlockIndex(); auto blockPtr{tr::ITensor::slice(secondaryPoolPtr, memoryPoolIndex, 1)}; auto rawBlockPtr = reinterpret_cast(blockPtr->data()); // Write value if constexpr (transferMode == KvCacheTransferMode::DRAM) { writePatternToOffloadedBlocksDRAM(rawBlockPtr, blockSize, mask); } else if constexpr (transferMode == KvCacheTransferMode::GDS) { auto block_id = block->getBlockId(); auto numPools = blockManager.getNumPools(false); writePatternToOffloadedBlocksGDS(directory, block_id, numPools, blockSize, mask); } // onboard blockManager.onboardBlock(seq0, block, maxAttentionWindow, transferMode, directory); EXPECT_TRUE(block->isPrimary()); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); EXPECT_TRUE(blockManager.verifyQueueIntegrity(maxAttentionWindow)); } blockManager.releaseBlocks(seq0, llmRequest0); blockManager.releaseSequence(seq0.getRequestId()); // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] auto inputTokens1 = inputTokens; auto const inputLength1 = static_cast(inputTokens1->size()); requestId = 1; 
auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming); GenerationRequest seq1{requestId, inputLength1, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1.getRequestId()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 16); auto cacheBlockIds1 = seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds1, ::testing::ElementsAreArray({0, 1, 6})); // store blocks 0, 1 ([0,1,2,3,4,5,6,7], [8,9,10,11,12,13,14,15]) blockManager.storeContextBlocks(seq1, *llmRequest1); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11] again. // Reuse blocks 0 and 1(pc). Block 1 is partially reused, but already referenced by seq1 so must be partial copied // into new block 2. Clear block 2 so we can see what was partial copied. 
auto block2 = blockManager.getBlockById(2, maxAttentionWindow); auto memoryPoolIndex2 = block2->getMemoryPoolBlockIndex(); auto block2Ptr{tr::ITensor::slice(primaryPoolPtr, memoryPoolIndex2, 1)}; EXPECT_EQ(cudaMemset(block2Ptr->data(), 0, blockSize * sizeof(T)), cudaSuccess); auto inputTokens2 = inputTokens; auto constexpr partiallyReusedTokens = 3; inputTokens2->resize(8 + partiallyReusedTokens + 1); auto const inputLength2 = static_cast(inputTokens2->size()); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens2, samplingConfig, isStreaming); GenerationRequest seq2{requestId, inputLength2, beamWidth, blockManager.getWindowSizesMetadata()}; auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq2.getRequestId()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 11); auto cacheBlockIds2 = seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds2, ::testing::ElementsAreArray({0, 2})); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); // Verify partial copied block 2 // Block has shape [2, numLayers, numKvHeads, tokensPerBlock, sizePerHead] blockManager.offloadBlock(block2, maxAttentionWindow); EXPECT_FALSE(block2->isPrimary()); // need to sync so D2H transfer is done before accessing blocks EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); memoryPoolIndex2 = block2->getMemoryPoolBlockIndex(); block2Ptr = tr::ITensor::slice(secondaryPoolPtr, memoryPoolIndex2, 1); T const* rawPtr2 = reinterpret_cast(block2Ptr->data()); int numBad = 0; for (int i = 0; i < blockSize && numBad < 10; ++i) { T value = rawPtr2[i]; int kOrV = i / (numLayers * numKvHeads * tokensPerBlock * sizePerHead); int j = i - kOrV * (numLayers * numKvHeads * tokensPerBlock * sizePerHead); int layer = j / (numKvHeads * tokensPerBlock * 
sizePerHead); j = j - layer * (numKvHeads * tokensPerBlock * sizePerHead); int head = j / (tokensPerBlock * sizePerHead); j = j - head * (tokensPerBlock * sizePerHead); int token = j / sizePerHead; j = j - token * sizePerHead; T expectedValue = (token < partiallyReusedTokens) ? i & mask : 0; if (value != expectedValue) { TLLM_LOG_WARNING( "block2[%d,%d,%d,%d,%d] - expected %d, actual %d", kOrV, layer, head, token, j, expectedValue, value); ++numBad; } } EXPECT_EQ(numBad, 0); blockManager.onboardBlock(seq2, block2, maxAttentionWindow, transferMode, directory); EXPECT_TRUE(block2->isPrimary()); EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess); blockManager.releaseBlocks(seq1, llmRequest1); blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseSequence(seq1.getRequestId()); blockManager.releaseSequence(seq2.getRequestId()); if constexpr (transferMode == KvCacheTransferMode::GDS) fs::remove_all(directory); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT64) { runPartialCopyTest(); runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT32) { runPartialCopyTest(); runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyFLOAT) { runPartialCopyTest(); runPartialCopyTest(); } #ifdef ENABLE_BF16 TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyBF16) { runPartialCopyTest(); runPartialCopyTest(); } #endif TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyHALF) { runPartialCopyTest(); runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyBOOL) { runPartialCopyTest(); runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyUINT8) { runPartialCopyTest(); runPartialCopyTest(); } TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT8) { runPartialCopyTest(); runPartialCopyTest(); } #ifdef ENABLE_FP8 TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyFP8) { runPartialCopyTest(); runPartialCopyTest(); } #endif TEST_F(KVCacheManagerTest, 
BlockManagerTestWindowSizeToShare) { auto constexpr numPrimaryBlocks = 16384; // Single window size { std::map> windowSizeToLayers{{1024, {0, 1, 2}}}; std::map cacheSizePerTokenPerWindow{{1024, 1}}; // Uniform cache size per token. auto result = BlockManager::calculateWindowSizeToShare(windowSizeToLayers, cacheSizePerTokenPerWindow); EXPECT_EQ(result.size(), 1); EXPECT_NEAR(result.at(1024), 1.0f, 1e-6f); // With a single window size, the entire share should be allocated to it. } // Variable window size { std::map> windowSizeToLayers{ {1024, {1}}, // contribution = 1024*1 = 1024 {4096, {0, 4, 5}}, // contribution = 4096*1 = 4096 {8192, {2, 3}}, // contribution = 8192*1 = 8192 }; // Use identical cache size per token across window sizes for simplicity. std::map cacheSizePerTokenPerWindow{{1024, 1}, {4096, 1}, {8192, 1}}; auto result = BlockManager::calculateWindowSizeToShare(windowSizeToLayers, cacheSizePerTokenPerWindow); EXPECT_EQ(result.size(), 3); // Ensure the shares sum to 1. auto const sumShares = std::accumulate( result.begin(), result.end(), 0.0f, [](float sum, auto const& kv) { return sum + kv.second; }); EXPECT_NEAR(sumShares, 1.0f, 1e-6f); // Calculate expected shares based on contributions. 
std::map expectedShares; std::map contributions; for (auto const& [windowSize, _] : windowSizeToLayers) { contributions[windowSize] = windowSize * 1.0f; } auto const totalContribution = std::accumulate(contributions.begin(), contributions.end(), 0.0f, [](float sum, auto const& kv) { return sum + kv.second; }); for (auto const& [windowSize, contribution] : contributions) { expectedShares[windowSize] = static_cast(contribution) / totalContribution; EXPECT_NEAR(result.at(windowSize), expectedShares[windowSize], 1e-6f); } // Verify the exact hard-coded values mentioned in the comment EXPECT_NEAR(result.at(1024), 0.0769f, 1e-4f); EXPECT_NEAR(result.at(4096), 0.3077f, 1e-4f); EXPECT_NEAR(result.at(8192), 0.6154f, 1e-4f); // Verify that when shares are converted to actual block counts, they match expected values. auto getRoundedBlocks = [&](float share) { return static_cast(std::round(share * numPrimaryBlocks)); }; EXPECT_EQ(getRoundedBlocks(result.at(1024)), 1260); EXPECT_EQ(getRoundedBlocks(result.at(4096)), 5041); EXPECT_EQ(getRoundedBlocks(result.at(8192)), 10082); } // Variable window size with different cache sizes per token per window { std::map> windowSizeToLayers{ {1024, {1}}, // contribution = 1024*(1*2) = 2048 (cache size per token per layer = 2) {4096, {0, 4, 5}}, // contribution = 4096*(3*4) = 49152 (cache size per token per layer = 4) {8192, {2, 3}}, // contribution = 8192*(2*1) = 16384 (cache size per token per layer = 1) }; // Different cache sizes per token per window. // cacheSizePerTokenPerWindow is accumulated across the layers of given window size. std::map cacheSizePerTokenPerWindow{{1024, 2}, {4096, 12}, {8192, 2}}; auto result = BlockManager::calculateWindowSizeToShare(windowSizeToLayers, cacheSizePerTokenPerWindow); EXPECT_EQ(result.size(), 3); // Ensure the shares sum to 1. 
auto const sumShares = std::accumulate( result.begin(), result.end(), 0.0f, [](float sum, auto const& kv) { return sum + kv.second; }); EXPECT_NEAR(sumShares, 1.0f, 1e-6f); // Calculate expected shares based on contributions with different cache sizes per token. std::map expectedShares; std::map contributions; for (auto const& [windowSize, _] : windowSizeToLayers) { auto const cacheSizePerToken = cacheSizePerTokenPerWindow.at(windowSize); contributions[windowSize] = windowSize * cacheSizePerToken; } auto const totalContribution = std::accumulate(contributions.begin(), contributions.end(), 0.0f, [](float sum, auto const& kv) { return sum + kv.second; }); for (auto const& [windowSize, contribution] : contributions) { expectedShares[windowSize] = static_cast(contribution) / totalContribution; EXPECT_NEAR(result.at(windowSize), expectedShares[windowSize], 1e-6f); } // Verify the calculated shares for different cache sizes per token EXPECT_NEAR(result.at(1024), 2048.0f / (2048.0f + 49152.0f + 16384.0f), 1e-6f); // ~0.0303 EXPECT_NEAR(result.at(4096), 49152.0f / (2048.0f + 49152.0f + 16384.0f), 1e-6f); // ~0.7273 EXPECT_NEAR(result.at(8192), 16384.0f / (2048.0f + 49152.0f + 16384.0f), 1e-6f); // ~0.2424 } // Edge case: Single layer per window with varying cache sizes { std::map> windowSizeToLayers{ {1024, {0}}, // contribution = 1024*1*8 = 8192 (cache size per token = 8) {4096, {1}}, // contribution = 4096*1*2 = 8192 (cache size per token = 2) {8192, {2}}, // contribution = 8192*1*1 = 8192 (cache size per token = 1) }; // Equal contributions but different cache sizes per token std::map cacheSizePerTokenPerWindow{{1024, 8}, {4096, 2}, {8192, 1}}; auto result = BlockManager::calculateWindowSizeToShare(windowSizeToLayers, cacheSizePerTokenPerWindow); EXPECT_EQ(result.size(), 3); // All should have equal shares since contributions are equal EXPECT_NEAR(result.at(1024), 1.0f / 3.0f, 1e-6f); EXPECT_NEAR(result.at(4096), 1.0f / 3.0f, 1e-6f); EXPECT_NEAR(result.at(8192), 1.0f / 
3.0f, 1e-6f); // Ensure the shares sum to 1. auto const sumShares = std::accumulate( result.begin(), result.end(), 0.0f, [](float sum, auto const& kv) { return sum + kv.second; }); EXPECT_NEAR(sumShares, 1.0f, 1e-6f); } } TEST_F(KVCacheManagerTest, FindBlocksInReuseTreeByBlockKeysTest) { auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 128; auto constexpr tokensPerBlock = 8; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 4; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr batchSize = 1; auto constexpr maxBlocksPerSeq = 10; auto constexpr bytesPerToken = 4; auto constexpr maxAttentionWindow = 4096; auto constexpr maxAttentionWindowAllLayer = 4096; auto constexpr sinkTokenLen = 0; auto constexpr canUseOneMoreBlock = true; SizeType32 constexpr maxNewTokens{0}; auto constexpr beamWidth = 1; auto constexpr beamIdx = 0; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager(numLayers, numKvHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, false, stream, true, onboardBlocks); // Add sequence [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] (17 tokens, three blocks) auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest0); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); auto cacheBlockIds = 
kvCacheManager.getSequence(requestId).getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds, ::testing::ElementsAreArray({0, 1, 2})); (void) kvCacheManager.removeSequence(requestId, llmRequest0); inputTokens->pop_back(); BlockKey fullKey{*inputTokens}; auto const foundFull = kvCacheManager.findBlocksInReuseTreeByBlockKey(fullKey, maxAttentionWindow); ASSERT_NE(foundFull, nullptr); auto const& lastBlock = foundFull; // Check the chain back to previous blocks auto const prev2 = lastBlock->getPrevBlock(); ASSERT_NE(prev2, nullptr); auto const prev1 = prev2->getPrevBlock(); ASSERT_NE(prev1, nullptr); EXPECT_EQ(prev1->getPrevBlock(), nullptr); } #ifdef ENABLE_FP4 TEST_F(KVCacheManagerTest, FP4BlockScaleManagementTest) { auto constexpr numLayers = 6; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 16; auto constexpr numFp4EltsPerContainer = 2; auto constexpr vectorSize = 16; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kFP4, false, stream, maxAttentionWindow, true, onboardBlocks); kvCacheManager.allocatePools(/*useUvm=*/false); // We should have one additional pool for the block scales. 
EXPECT_EQ(kvCacheManager.getBlockManager().getNumPools(), 2); EXPECT_EQ(kvCacheManager.getBlockManager().getNumPools(/*includeBlockScalePools=*/false), 1); EXPECT_NE(kvCacheManager.getBlockScalePoolPointers(), nullptr); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_TRUE(blockManager.containsBlockScales(1)); // Block size of pool 0 reflects the number of container elements. It is number of FP4 elements / 2. // The expected block size of pool 1 should be the number of FP4 elements / vectorSize. EXPECT_EQ(blockManager.getBlockSize(0) * numFp4EltsPerContainer / vectorSize, blockManager.getBlockSize(1)); } #endif TEST_F(KVCacheManagerTest, BlockManagerReuseTest) { auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8}); auto const inputLength = 
static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; /////////////////////////////////////////////////////////////////////////// // add request and then remove it // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8] auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest0->addNewToken(9, beamIdx); // block 2 contains [8] llmRequest0->addNewToken(10, beamIdx); // block 2 contains [8, 9] auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 0, 1, 2 are stored for reuse (blocks contain [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]) blockManager.releaseBlocks(seq0, llmRequest0); blockManager.releaseSequence(seq0.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // new request with same tokens [0, 1, 2, 3, 4, 5, 6, 7, 8] and then remove it requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 
0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 3 auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1.getRequestId()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3})); // at this point, block 3 contains [8] llmRequest1->addNewToken(9, beamIdx); // block 3 contains [8, 9] llmRequest1->addNewToken(10, beamIdx); // block 3 contains [8, 9, 10] EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 3 matches block 2 and will be freed (blocks contain [8, 9]) blockManager.releaseBlocks(seq1, llmRequest1); blockManager.releaseSequence(seq1.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] // reuse blocks 0, 1, 2(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8]) :: p = partial reuse auto inputTokens0 = std::make_shared(*inputTokens); inputTokens0->emplace_back(9); GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; llmRequest0 = std::make_shared( seq0_dup.getRequestId(), maxNewTokens, inputTokens0, samplingConfig, isStreaming); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0_dup.getRequestId()); blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 
promptLen0 - 1); EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // note that seq0_dup is holding blocks 0, 1 and 2 until releaseBlocks is called // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] // reuse blocks 0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 4 auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; llmRequest1 = std::make_shared( seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, isStreaming); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1_dup.getRequestId()); blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); llmRequest1->addNewToken(10, beamIdx); // block 4 contains [8, 9, 10] EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1); // block 2 is stored for reuse (block contains [8]). nb! Last token of last block is never stored blockManager.releaseBlocks(seq0_dup, llmRequest0); blockManager.releaseSequence(seq0_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 4 is stored for reuse (block contains [8, 9]). nb! 
Last token of last block is never stored blockManager.releaseBlocks(seq1_dup, llmRequest1); blockManager.releaseSequence(seq1_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with less tokens // input tokens [0, 1, 2, 3, 4] auto inputLength2 = tokensPerBlock + 1; auto inputTokens2 = std::make_shared(VecTokens{inputTokens->begin(), inputTokens->begin() + inputLength2}); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens2, samplingConfig, isStreaming); numTokens = llmRequest2->getNumTokens(beamIdx); GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 0 ([0, 1, 2, 3]), get new block 5 auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq2.getRequestId()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 5})); llmRequest2->addNewToken(5, beamIdx); // block 5 contains [4] numTokens = llmRequest2->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); /////////////////////////////////////////////////////////////////////////// // add request with more tokens // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11] auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11}); requestId = 3; auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens3, samplingConfig, isStreaming); numTokens = 
llmRequest3->getNumTokens(beamIdx); GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, 1, 4(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8, 9]) auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq3.getRequestId()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), numTokens - 1); EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); llmRequest3->addNewToken(11, beamIdx); // block 4 contains [8, 9, 11] numTokens = llmRequest3->getNumTokens(beamIdx); // one block used by both seq2 and seq3 numBlocks += tc::ceilDiv(numTokens, tokensPerBlock) - 1; EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 5 is not stored since it is last block and has only one token blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseSequence(seq2.getRequestId()); // block 4 is stored for reuse (block contains [8, 9]). nb! 
Last token of last block not stored blockManager.releaseBlocks(seq3, llmRequest3); blockManager.releaseSequence(seq3.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with 11 tokens, then discard few tokens from request and release a shorter one auto inputTokens4 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12}); auto inputTokens4Short = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8}); requestId = 4; auto llmRequest4 = std::make_shared(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming); numTokens = llmRequest4->getNumTokens(beamIdx); GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, 1, 4(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8,9]) auto promptLen4 = llmRequest4->getNumTokens(beamIdx); auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq4.getRequestId()); blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 1); EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); auto llmRequest4Short = std::make_shared(requestId, maxNewTokens, inputTokens4Short, samplingConfig, isStreaming); // llmRequest4Short tokens [0, 1, 2, 3, 4, 5, 6, 7, 8] // blocks 0 and 1 ([0, 1, 2, 3], [4, 5, 6, 7]) are already stored, // block 4 is freed blockManager.releaseBlocks(seq4, llmRequest4Short); blockManager.releaseSequence(seq4.getRequestId()); 
EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with 11 tokens again and make sure no discarded tokens reuse happens // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12] // reuse blocks 0, 1, 2(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8]) // nb! LlmRequest retains state calculated during addSequence, this state affects result. // Calling addSequence a second time with same LlmRequest object will produce incorrect state. // Create new llmRequest4 instance to avoid this issue. GenerationRequest seq4_dup{14, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; llmRequest4 = std::make_shared( seq4_dup.getRequestId(), maxNewTokens, inputTokens4, samplingConfig, isStreaming); promptLen4 = llmRequest4->getNumTokens(beamIdx); numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq4_dup.getRequestId()); blockManager.addSequence(seq4_dup, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 2); EXPECT_THAT(seq4_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); blockManager.releaseBlocks(seq4_dup, llmRequest4); blockManager.releaseSequence(seq4_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with max size with incidental reuse of first token. // this happens because we initialize inputTokens5 with 0's. 
auto inputLength5 = blocksInPrimaryPool * tokensPerBlock - 1; auto inputTokens5 = std::make_shared(VecTokens(inputLength5, 0)); requestId = 5; auto llmRequest5 = std::make_shared(requestId, maxNewTokens, inputTokens5, samplingConfig, isStreaming); numTokens = llmRequest5->getNumTokens(beamIdx); GenerationRequest seq5{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, all blocks need to be freed auto promptLen5 = llmRequest5->getNumTokens(beamIdx); auto numContextBlocks5 = tc::ceilDiv(promptLen5, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq5.getRequestId()); blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5, maxAttentionWindow); llmRequest5->addNewToken(0, beamIdx); EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 1); // incidental reuse EXPECT_EQ(blockManager.getNumAllocatedBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), 0); blockManager.releaseBlocks(seq5, llmRequest5); blockManager.releaseSequence(seq5.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with min size that doesn't reuse blocks auto inputLength6 = 1; auto inputTokens6 = std::make_shared(VecTokens(inputLength6, 0)); requestId = 6; auto llmRequest6 = std::make_shared(requestId, maxNewTokens, inputTokens6, samplingConfig, isStreaming); numTokens = llmRequest6->getNumTokens(beamIdx); GenerationRequest seq6{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, all blocks need to be freed auto promptLen6 = llmRequest6->getNumTokens(beamIdx); auto numContextBlocks6 = tc::ceilDiv(promptLen6, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq6.getRequestId()); blockManager.addSequence(seq6, promptLen6, numContextBlocks6, *llmRequest6, maxAttentionWindow); llmRequest6->addNewToken(0, 
beamIdx); // no reuse occurs because we are unable to reuse last input token and inputLength6 == 1. EXPECT_EQ(llmRequest6->getContextCurrentPosition(), 0); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - 1); blockManager.releaseBlocks(seq6, llmRequest6); blockManager.releaseSequence(seq6.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); } TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest) { // tc::Logger::getLogger()->setLevel(tc::Logger::Level::DEBUG); using VecTokenExtraIds = LlmRequest::VecTokenExtraIds; auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr numReturnSequences = 1; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // assume prompt id starts from 100 auto inputTokens = std::make_shared(VecTokens{100, 101, 102, 103, 
104, 105, 0, 1, 2}); auto inputTokenExtraIds = std::make_shared(VecTokenExtraIds{1, 1, 2, 2, 3, 3, 0, 0, 0}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; /////////////////////////////////////////////////////////////////////////// // add request and then remove it auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest0->addNewToken(3, beamIdx); llmRequest0->addNewToken(4, beamIdx); auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 0, 1, 2 are stored for reuse (block 2 contains [(2, 0), (3, 0)]) blockManager.releaseBlocks(seq0, llmRequest0); 
blockManager.releaseSequence(seq0.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // new request with same tokens and then remove it requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences); GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse blocks 0, 1 and get new block 3 auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1.getRequestId()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3})); llmRequest1->addNewToken(3, beamIdx); llmRequest1->addNewToken(4, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 3 matches block 2 and will be freed blockManager.releaseBlocks(seq1, llmRequest1); blockManager.releaseSequence(seq1.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), 
blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them // reuse blocks 0, 1 and get new block 4 GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; llmRequest0 = std::make_shared(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0_dup.getRequestId()); blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); llmRequest0->addNewToken(3, beamIdx); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // reuse blocks 0, 1 and reuse block 2 auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); auto inputTokenExtraIds1 = std::make_shared(*inputTokenExtraIds); inputTokenExtraIds1->push_back(0); inputTokenExtraIds1->push_back(0); GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; llmRequest1 = std::make_shared(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, 
isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1, numReturnSequences); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1_dup.getRequestId()); blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1); EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest1->addNewToken(5, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1); blockManager.releaseBlocks(seq0_dup, llmRequest0); blockManager.releaseSequence(seq0_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 2 is stored for reuse (block contains [(2, 0), (3, 0), (4, 0)]) blockManager.releaseBlocks(seq1_dup, llmRequest1); blockManager.releaseSequence(seq1_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with totally different extra ids auto inputTokenExtraIds2 = std::make_shared(VecTokenExtraIds{4, 4, 5, 5, 6, 6, 0, 0, 0}); requestId = 2; auto llmRequest2 = 
std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds2, numReturnSequences); numTokens = llmRequest2->getNumTokens(beamIdx); GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, get new block 5, 6, 7 auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq2.getRequestId()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0); EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7})); llmRequest2->addNewToken(3, beamIdx); numTokens = llmRequest2->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); /////////////////////////////////////////////////////////////////////////// // add request with partial different extra ids auto inputTokenExtraIds3 = std::make_shared(VecTokenExtraIds{1, 1, 2, 2, 4, 4, 0, 0, 0}); requestId = 3; auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, 
std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds3, numReturnSequences); numTokens = llmRequest3->getNumTokens(beamIdx); GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 0, get new block 8, 9 auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq3.getRequestId()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 8, 9})); llmRequest3->addNewToken(3, beamIdx); numTokens = llmRequest3->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseBlocks(seq3, llmRequest3); blockManager.releaseSequence(seq2.getRequestId()); blockManager.releaseSequence(seq3.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); } TEST_F(KVCacheManagerTest, BlockManagerReuseWithMultimodalHashTest) { using VecTokenExtraIds = LlmRequest::VecTokenExtraIds; auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr 
maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr numReturnSequences = 1; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // Create multimodal hash data (256-bit hash = 8 int32 values) auto multimodalHashes = std::make_shared>>(std::vector>{ {0x12345678, -0x6F543211, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666} // Hash 1 }); auto multimodalPositions = std::make_shared>(std::vector{2}); // Start at token 2 auto multimodalLengths = std::make_shared>(std::vector{4}); // Length 4 tokens // assume prompt id starts from 100 auto inputTokens = std::make_shared(VecTokens{100, 101, 102, 103, 104, 105, 0, 1, 2}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, multimodalHashes, multimodalPositions, multimodalLengths, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, 
std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; /////////////////////////////////////////////////////////////////////////// // add request and then remove it auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest0->addNewToken(3, beamIdx); llmRequest0->addNewToken(4, beamIdx); auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // Input: [100, 101, 102, 103, 104, 105, 0, 1, 2] (9 tokens) // Multimodal: starts at token 2, length 4 → [102, 103, 104, 105] // Block 0: [100, 101, 102, 103] ← Contains multimodal (102, 103) // Block 1: [104, 105, 0, 1] ← Contains multimodal (104, 105) // Block 2: [2, 3, 4] ← No multimodal blockManager.releaseBlocks(seq0, llmRequest0); blockManager.releaseSequence(seq0.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // new request with same tokens and same multimodal hash - should reuse requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, 
samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, multimodalHashes, multimodalPositions, multimodalLengths, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences); GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // should reuse blocks 0, 1 and get new block 3 auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1.getRequestId()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3})); llmRequest1->addNewToken(3, beamIdx); llmRequest1->addNewToken(4, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 3 matches block 2 and will be freed blockManager.releaseBlocks(seq1, llmRequest1); blockManager.releaseSequence(seq1.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // Test Case 2: Different multimodal hash requestId = 2; auto multimodalHashes2 = std::make_shared>>(std::vector>{ {0x45678123, 0x23456789, 0x34567890, 0x12121212, 0x56565656, 0x78787878, 0x54545454, 0x67676767} // Hash 2 }); auto multimodalPositions2 = 
std::make_shared>(std::vector{2}); // Start at token 2 auto multimodalLengths2 = std::make_shared>(std::vector{4}); // Length 4 tokens auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, multimodalHashes2, multimodalPositions2, multimodalLengths2, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences); GenerationRequest seq2{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, get new blocks 4, 5, 6 auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq2.getRequestId()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0); EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({4, 5, 6})); llmRequest2->addNewToken(9, beamIdx); numTokens = llmRequest2->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); /////////////////////////////////////////////////////////////////////////// // Test Case 3: Multiple multimodal hashes and partial reuse requestId = 3; auto multimodalHashes3 = std::make_shared>>(std::vector>{ {0x12345678, -0x6F543211, 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666}, // Hash 1 {0x45678123, 0x23456789, 0x34567890, 0x12121212, 
0x56565656, 0x78787878, 0x54545454, 0x67676767} // Hash 2 }); auto multimodalPositions3 = std::make_shared>(std::vector{2, 4}); // Start at token 2 and 4 auto multimodalLengths3 = std::make_shared>(std::vector{2, 2}); // Length 2 tokens auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, multimodalHashes3, multimodalPositions3, multimodalLengths3, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences); GenerationRequest seq3{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 0, get new blocks 7, 8 auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq3.getRequestId()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock); // only reuse block 0 [100, 101, 102, 103] with same hash/offset EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 7, 8})); llmRequest3->addNewToken(11, beamIdx); numTokens = llmRequest3->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); // clean up blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseBlocks(seq3, llmRequest3); blockManager.releaseSequence(seq2.getRequestId()); 
blockManager.releaseSequence(seq3.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
}

// Verifies that the LoRA task id is part of the KV-cache block reuse key: blocks are
// only reused when both the token sequence AND the LoRA task id match.
// NOTE(review): every template-argument list below was reconstructed; the extracted
// source had all angle-bracket contents stripped and did not compile.
TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
{
    // tc::Logger::getLogger()->setLevel(tc::Logger::Level::DEBUG);
    using VecTokenExtraIds = LlmRequest::VecTokenExtraIds;

    auto constexpr numLayers = 12;
    auto constexpr numKvHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr blocksInPrimaryPool = 16;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr maxNumSequences = 8;
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr onboardBlocks = true;
    auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
    auto constexpr beamWidth = 1;

    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};

    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock,
        blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth,
        std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks);
    blockManager.allocatePools(false);

    EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
    EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};

    // loraTaskId is 0 for common cases
    LlmRequest::LoraTaskIdType loraTaskId{0};
    auto inputTokens = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8});
    auto const inputLength = static_cast<SizeType32>(inputTokens->size());
    LlmRequest::RequestIdType requestId{0};
    auto llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

    ///////////////////////////////////////////////////////////////////////////
    // add request and then remove it
    auto constexpr beamIdx = 0;
    auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
    auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
    // get new blocks 0, 1, 2 ([0,1,2,3], [4,5,6,7], [8])
    blockManager.holdSequence(seq0.getRequestId());
    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
    EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0);
    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
    llmRequest0->addNewToken(9, beamIdx);
    llmRequest0->addNewToken(10, beamIdx);
    auto numTokens = llmRequest0->getNumTokens(beamIdx);
    auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
    EXPECT_EQ(numBlocks, 3);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    // store blocks 0, 1, 2 for reuse ([0,1,2,3], [4,5,6,7], [8,9])
    blockManager.releaseBlocks(seq0, llmRequest0);
    blockManager.releaseSequence(seq0.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    ///////////////////////////////////////////////////////////////////////////
    // new request with same tokens and loraTaskId, then remove it
    // inputTokens = (0, 1, 2, 3, 4, 5, 6, 7, 8)
    requestId = 1;
    auto llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
    GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

    // reuse blocks 0, 1 and get new block 3
    auto promptLen1 = llmRequest1->getNumTokens(beamIdx);
    auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
    blockManager.holdSequence(seq1.getRequestId());
    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
    EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 3}));
    llmRequest1->addNewToken(9, beamIdx);
    llmRequest1->addNewToken(10, beamIdx);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    // store block 3 for reuse ([8,9])
    blockManager.releaseBlocks(seq1, llmRequest1);
    blockManager.releaseSequence(seq1.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    ///////////////////////////////////////////////////////////////////////////
    // add both requests again and then remove them
    // inputTokens = (0, 1, 2, 3, 4, 5, 6, 7, 8)
    // reuse blocks 0, 1 and get new block 4
    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
    llmRequest0 = std::make_shared<LlmRequest>(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig,
        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
    promptLen0 = llmRequest0->getNumTokens(beamIdx);
    numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
    blockManager.holdSequence(seq0_dup.getRequestId());
    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
    // nb! addNewToken adds new generated token, number of input tokens stay the same.
    // calling addNewToken before addSequence potentially triggers this error message:
    // Assertion failed: prepopulatedPromptLen < promptLen
    // because maximum value for prepopulatedPromptLen is number of input+output tokens - 1,
    // but promptLen is number of input tokens.
    llmRequest0->addNewToken(9, beamIdx);
    EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);

    // inputTokens1 = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
    auto inputTokens1 = std::make_shared<VecTokens>(llmRequest1->getTokens(0));
    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
    llmRequest1 = std::make_shared<LlmRequest>(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig,
        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
    promptLen1 = llmRequest1->getNumTokens(beamIdx);
    numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
    // reuse 0, 1, 2(p) ([0,1,2,3], [4,5,6,7], [8])
    blockManager.holdSequence(seq1_dup.getRequestId());
    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
    EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
    llmRequest1->addNewToken(10, beamIdx);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
    // store block 4 for reuse ([8])
    blockManager.releaseBlocks(seq0_dup, llmRequest0);
    blockManager.releaseSequence(seq0_dup.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    // blocks 2 is stored for reuse (block contains [8, 9]). nb! Last token of last block is not stored
    blockManager.releaseBlocks(seq1_dup, llmRequest1);
    blockManager.releaseSequence(seq1_dup.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    ///////////////////////////////////////////////////////////////////////////
    // add request with loraTaskId 1
    loraTaskId = static_cast<LlmRequest::LoraTaskIdType>(1);
    requestId = 2;
    auto llmRequest2 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
    numTokens = llmRequest2->getNumTokens(beamIdx);
    GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};

    // no reuse, get new block 5, 6, 7
    auto promptLen2 = llmRequest2->getNumTokens(beamIdx);
    auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock());
    blockManager.holdSequence(seq2.getRequestId());
    blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow);
    // no reuse expected. Input tokens match blocks 0 and 1, but lora task id differs.
    EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0);
    EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
    llmRequest2->addNewToken(9, beamIdx);
    numTokens = llmRequest2->getNumTokens(beamIdx);
    numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    // store blocks 5, 6, 7 for reuse ([0,1,2,3], [4,5,6,7], [8]) with loraTaskId 1
    blockManager.releaseBlocks(seq2, llmRequest2);
    blockManager.releaseSequence(seq2.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    ///////////////////////////////////////////////////////////////////////////
    // add request with loraTaskId 1 and more tokens
    auto inputTokens3 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11});
    requestId = 3;
    auto llmRequest3 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens3, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
    numTokens = llmRequest3->getNumTokens(beamIdx);
    GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};

    // reuse blocks 5, 6, 7(p) ([0,1,2,3], [4,5,6,7], [8])
    auto promptLen3 = llmRequest3->getNumTokens(beamIdx);
    auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock());
    blockManager.holdSequence(seq3.getRequestId());
    blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow);
    EXPECT_EQ(llmRequest3->getContextCurrentPosition(), promptLen3 - 2);
    EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({5, 6, 7}));
    llmRequest3->addNewToken(11, beamIdx);
    numTokens = llmRequest3->getNumTokens(beamIdx);
    numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    // store block 7 for reuse ([8,9]) with loraTaskId 1
    blockManager.releaseBlocks(seq3, llmRequest3);
    blockManager.releaseSequence(seq3.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    ///////////////////////////////////////////////////////////////////////////
    // add request with loraTaskId 0 again but with less tokens
    loraTaskId = static_cast<LlmRequest::LoraTaskIdType>(0);
    auto inputLength4 = 5;
    auto inputTokens4 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4});
    requestId = 4;
    auto llmRequest4 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
    numTokens = llmRequest4->getNumTokens(beamIdx);
    GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};

    // reuse blocks 0, get new block 8
    auto promptLen4 = llmRequest4->getNumTokens(beamIdx);
    auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
    blockManager.holdSequence(seq4.getRequestId());
    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
    EXPECT_EQ(llmRequest4->getContextCurrentPosition(), tokensPerBlock);
    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 8}));
    llmRequest4->addNewToken(5, beamIdx);
    numTokens = llmRequest4->getNumTokens(beamIdx);
    numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    // blocks 8 is stored with [4] and loraTaskId 0
    blockManager.releaseBlocks(seq4, llmRequest4);
    blockManager.releaseSequence(seq4.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    // add request with same tokens as request0 but without loraTaskId
    requestId = 5;
    auto llmRequest5 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt);
    numTokens = llmRequest5->getNumTokens(beamIdx);
    GenerationRequest seq5{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};

    // no reuse, get new block 9, 10, 11
    auto promptLen5 = llmRequest5->getNumTokens(beamIdx);
    auto numContextBlocks5 = tc::ceilDiv(promptLen5, blockManager.getTokensPerBlock());
    blockManager.holdSequence(seq5.getRequestId());
    blockManager.addSequence(seq5, promptLen5, numContextBlocks5, *llmRequest5, maxAttentionWindow);
    EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 0);
    EXPECT_THAT(seq5.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({9, 10, 11}));
    llmRequest5->addNewToken(9, beamIdx);
    numTokens = llmRequest5->getNumTokens(beamIdx);
    numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    // blocks 9, 10, 11 are stored without loraTaskId
    blockManager.releaseBlocks(seq5, llmRequest5);
    blockManager.releaseSequence(seq5.getRequestId());
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
}

TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
{
    // tc::Logger::getLogger()->setLevel(tc::Logger::Level::DEBUG);
    using VecTokenExtraIds = LlmRequest::VecTokenExtraIds;
    auto constexpr numLayers = 12;
    auto constexpr numKvHeads = 6;
    auto constexpr
sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // assume prompt id starts from 100 auto inputTokens = std::make_shared(VecTokens{100, 101, 102, 103, 104, 105, 0, 1, 2}); auto inputTokenExtraIds = std::make_shared(VecTokenExtraIds{1, 1, 2, 2, 3, 3, 0, 0, 0}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::LoraTaskIdType loraTaskId1{1}; LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, 
LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds); GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; /////////////////////////////////////////////////////////////////////////// // add request with loraTaskId 1 and then remove it auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest0->addNewToken(3, beamIdx); llmRequest0->addNewToken(4, beamIdx); auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 0, 1, 2 are stored for reuse (block 2 contains [(2, 0), (3, 0)] with loraTaskId 1) blockManager.releaseBlocks(seq0, llmRequest0); blockManager.releaseSequence(seq0.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // new request with same tokens but different loraTaskId and then remove it requestId = 1; LlmRequest::LoraTaskIdType loraTaskId2 = static_cast(2); auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt, std::nullopt, 
std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds); GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // no reuse, get new block 3, 4, 5 auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1.getRequestId()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0); EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); llmRequest1->addNewToken(3, beamIdx); llmRequest1->addNewToken(4, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 3, 4, 5 are stored for reuse (block 5 contains [(2, 0), (3, 0)] with loraTaskId 2) blockManager.releaseBlocks(seq1, llmRequest1); blockManager.releaseSequence(seq1.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; llmRequest0 = std::make_shared(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, 
std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); // reuse blocks 0, 1 and get new block 6 blockManager.holdSequence(seq0_dup.getRequestId()); blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); llmRequest0->addNewToken(3, beamIdx); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock); EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // reuse blocks 3, 4 and reuse block 5 auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); auto inputTokenExtraIds1 = std::make_shared(*inputTokenExtraIds); inputTokenExtraIds1->push_back(0); inputTokenExtraIds1->push_back(0); GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; llmRequest1 = std::make_shared(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); 
blockManager.holdSequence(seq1_dup.getRequestId()); blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1); EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); llmRequest1->addNewToken(5, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); blockManager.releaseBlocks(seq0_dup, llmRequest0); blockManager.releaseSequence(seq0_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); blockManager.releaseBlocks(seq1_dup, llmRequest1); blockManager.releaseSequence(seq1_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // add request with totally different extra ids and loraTaskId 1 auto inputTokenExtraIds2 = std::make_shared(VecTokenExtraIds{4, 4, 5, 5, 6, 6, 0, 0, 0}); requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds2); numTokens = llmRequest2->getNumTokens(beamIdx); GenerationRequest seq2{requestId, numTokens, beamWidth, 
blockManager.getWindowSizesMetadata()}; // no reuse, get new block 7, 8, 9 auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq2.getRequestId()); blockManager.addSequence(seq2, promptLen2, numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 0); EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({7, 8, 9})); llmRequest2->addNewToken(3, beamIdx); numTokens = llmRequest2->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); /////////////////////////////////////////////////////////////////////////// // add request with partial different extra ids and loraTaskId 1 auto inputTokenExtraIds3 = std::make_shared(VecTokenExtraIds{1, 1, 2, 2, 4, 4, 0, 0, 0}); requestId = 3; auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds3); numTokens = llmRequest3->getNumTokens(beamIdx); GenerationRequest seq3{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 0, get new block 10, 11 auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); 
blockManager.holdSequence(seq3.getRequestId()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 10, 11})); llmRequest3->addNewToken(3, beamIdx); numTokens = llmRequest3->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); /////////////////////////////////////////////////////////////////////////// // add request with partial different extra ids and loraTaskId 2 requestId = 4; auto llmRequest4 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds3); numTokens = llmRequest4->getNumTokens(beamIdx); GenerationRequest seq4{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; // reuse block 3, get new block 12, 13 auto promptLen4 = llmRequest4->getNumTokens(beamIdx); auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq4.getRequestId()); blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), tokensPerBlock); EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), 
::testing::ElementsAreArray({3, 12, 13})); llmRequest4->addNewToken(3, beamIdx); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 3); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 3); blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseBlocks(seq3, llmRequest3); blockManager.releaseBlocks(seq4, llmRequest4); blockManager.releaseSequence(seq2.getRequestId()); blockManager.releaseSequence(seq3.getRequestId()); blockManager.releaseSequence(seq4.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); } TEST_F(KVCacheManagerTest, BlockManagerReuseWithCacheSaltIdTest) { // Test that cache_salt_id prevents KV cache reuse between requests with same tokens // but different cache_salt_id values. using VecTokenExtraIds = LlmRequest::VecTokenExtraIds; using CacheSaltIDType = LlmRequest::CacheSaltIDType; auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 8; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr numReturnSequences = 1; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), 
tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // Create shared input tokens auto inputTokens = std::make_shared(VecTokens{100, 101, 102, 103, 104, 105, 106, 107, 108}); auto const inputLength = static_cast(inputTokens->size()); /////////////////////////////////////////////////////////////////////////// // Test Case 1: Request without cache_salt_id LlmRequest::RequestIdType requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences, std::nullopt, std::nullopt, false, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt); // No cache_salt_id GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // Add first request and get blocks 0, 1, 2 auto constexpr beamIdx = 0; auto promptLen0 = llmRequest0->getNumTokens(beamIdx); auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 0); EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); // Add generated tokens 
llmRequest0->addNewToken(3, beamIdx); llmRequest0->addNewToken(4, beamIdx); auto numTokens = llmRequest0->getNumTokens(beamIdx); auto numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(numBlocks, 3); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // Release blocks to make them available for reuse blockManager.releaseBlocks(seq0, llmRequest0); blockManager.releaseSequence(seq0.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // Test Case 2: Request with same tokens but with cache_salt_id = 12345 requestId = 1; CacheSaltIDType cacheSaltId1{12345}; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences, std::nullopt, std::nullopt, false, std::nullopt, std::nullopt, std::nullopt, std::nullopt, cacheSaltId1); // With cache_salt_id = 12345 GenerationRequest seq1{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // Should NOT reuse blocks despite same tokens, because cache_salt_id is different auto promptLen1 = llmRequest1->getNumTokens(beamIdx); auto numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1.getRequestId()); blockManager.addSequence(seq1, promptLen1, numContextBlocks1, 
*llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0); // No reuse, starts from scratch EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); llmRequest1->addNewToken(3, beamIdx); llmRequest1->addNewToken(4, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // Release blocks blockManager.releaseBlocks(seq1, llmRequest1); blockManager.releaseSequence(seq1.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // Test Case 3: Request with same tokens and same cache_salt_id = 12345 requestId = 2; auto llmRequest2 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences, std::nullopt, std::nullopt, false, std::nullopt, std::nullopt, std::nullopt, std::nullopt, cacheSaltId1); // Same cache_salt_id = 12345 GenerationRequest seq2{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // SHOULD reuse blocks because both tokens and cache_salt_id match auto promptLen2 = llmRequest2->getNumTokens(beamIdx); auto numContextBlocks2 = tc::ceilDiv(promptLen2, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq2.getRequestId()); blockManager.addSequence(seq2, promptLen2, 
numContextBlocks2, *llmRequest2, maxAttentionWindow); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 2 * tokensPerBlock); // Reuse blocks 3,4 EXPECT_THAT(seq2.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 6})); llmRequest2->addNewToken(3, beamIdx); llmRequest2->addNewToken(4, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // Release blocks blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseSequence(seq2.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); /////////////////////////////////////////////////////////////////////////// // Test Case 4: Request with same tokens but different cache_salt_id = 67890 requestId = 3; CacheSaltIDType cacheSaltId2{67890}; auto llmRequest3 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences, std::nullopt, std::nullopt, false, std::nullopt, std::nullopt, std::nullopt, std::nullopt, cacheSaltId2); // Different cache_salt_id = 67890 GenerationRequest seq3{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // Should NOT reuse blocks from any previous request because cache_salt_id is different auto promptLen3 = llmRequest3->getNumTokens(beamIdx); auto numContextBlocks3 = tc::ceilDiv(promptLen3, blockManager.getTokensPerBlock()); 
blockManager.holdSequence(seq3.getRequestId()); blockManager.addSequence(seq3, promptLen3, numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 0); // No reuse EXPECT_THAT(seq3.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({7, 8, 9})); llmRequest3->addNewToken(5, beamIdx); llmRequest3->addNewToken(6, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); /////////////////////////////////////////////////////////////////////////// // Test Case 5: Request without cache_salt_id again requestId = 4; auto llmRequest4 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, std::nullopt, numReturnSequences, std::nullopt, std::nullopt, false, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt); // No cache_salt_id GenerationRequest seq4{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; // Should reuse blocks from request0 (blocks 0,1) because both have no cache_salt_id auto promptLen4 = llmRequest4->getNumTokens(beamIdx); auto numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq4.getRequestId()); blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), 2 * tokensPerBlock); // Reuse blocks 0,1 
EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 10})); llmRequest4->addNewToken(7, beamIdx); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); // Clean up blockManager.releaseBlocks(seq3, llmRequest3); blockManager.releaseBlocks(seq4, llmRequest4); blockManager.releaseSequence(seq3.getRequestId()); blockManager.releaseSequence(seq4.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); } TEST_F(KVCacheManagerTest, KVCacheManagerPerRequestStatsTest) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxSequenceLength = maxBlocksPerSeq * tokensPerBlock; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 16; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, maxSequenceLength, true, onboardBlocks); kvCacheManager.allocatePools(false); auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8}); auto const inputLength = static_cast(inputTokens->size()); LlmRequest::RequestIdType 
requestId{0}; auto llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); // Add the sequence to req0 EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest0)); // After first addition, check allocations and reuses auto numBlocks = tc::ceilDiv(inputLength, tokensPerBlock); EXPECT_EQ(llmRequest0->getReusedBlocksPerRequest(), 0); EXPECT_EQ(llmRequest0->getAllocTotalBlocksPerRequest(), numBlocks); EXPECT_EQ(llmRequest0->getAllocNewBlocksPerRequest(), numBlocks); EXPECT_EQ(llmRequest0->getMissedBlocksPerRequest(), numBlocks); EXPECT_NO_THROW((void) kvCacheManager.removeSequence(requestId, llmRequest0)); requestId = 1; auto llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest1)); auto const numSharedBlocks = inputLength / tokensPerBlock; EXPECT_EQ(llmRequest1->getReusedBlocksPerRequest(), numSharedBlocks); EXPECT_EQ(llmRequest1->getAllocTotalBlocksPerRequest(), numBlocks - numSharedBlocks); EXPECT_EQ(llmRequest1->getAllocNewBlocksPerRequest(), numBlocks - numSharedBlocks); EXPECT_EQ(llmRequest1->getMissedBlocksPerRequest(), numBlocks - numSharedBlocks); EXPECT_EQ(llmRequest1->getKVCacheHitRatePerRequest(), static_cast(numSharedBlocks) / static_cast(numBlocks)); } TEST_F(KVCacheManagerTest, BlockManagerBlockPriorityTest) { auto constexpr numLayers = 12; auto constexpr numKvHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 0; auto constexpr maxNumSequences = 4; auto const stream = std::make_shared(); auto constexpr onboardBlocks = true; auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto constexpr beamWidth = 1; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, 
blocksInSecondaryPool}}}; BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, stream, maxAttentionWindow, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, onboardBlocks); blockManager.allocatePools(false); EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); SizeType32 constexpr maxNewTokens{0}; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; // Add a request at a high and very low priority auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); llmRequest0->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, 4, 90), KvCacheRetentionConfig::TokenRangeRetentionConfig(4, 8, 10)}, 20)); GenerationRequest seq0{0, inputLength0, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks0 = tc::ceilDiv(inputLength0, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, llmRequest0->getNumTokens(0), numContextBlocks0, *llmRequest0, maxAttentionWindow); // Add another sequence with different tokens, at a low priority auto inputTokens1 = std::make_shared(VecTokens{8, 9, 10, 11, 12, 13, 14, 15}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); GenerationRequest seq1{1, inputLength1, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks1 = tc::ceilDiv(inputLength1, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq1.getRequestId()); 
blockManager.addSequence(seq1, llmRequest1->getNumTokens(0), numContextBlocks1, *llmRequest1, maxAttentionWindow); // Release both sequences blockManager.releaseBlocks(seq0, llmRequest0); blockManager.releaseBlocks(seq1, llmRequest1); blockManager.releaseSequence(seq0.getRequestId()); blockManager.releaseSequence(seq1.getRequestId()); // Add and then release another sequence auto inputTokens2 = std::make_shared(VecTokens{16, 17, 18, 19, 20, 21, 22, 23}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); llmRequest2->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 20)}, 20)); GenerationRequest seq2{2, inputLength2, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks2 = tc::ceilDiv(inputLength2, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq2.getRequestId()); blockManager.addSequence(seq2, llmRequest2->getNumTokens(0), numContextBlocks2, *llmRequest2, maxAttentionWindow); blockManager.releaseBlocks(seq2, llmRequest2); blockManager.releaseSequence(seq2.getRequestId()); // Check that request 1 blocks were overwritten auto inputTokens3 = std::make_shared(VecTokens{8, 9, 10, 11, 12, 13, 14, 15}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); GenerationRequest seq3{3, inputLength3, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks3 = tc::ceilDiv(inputLength3, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq3.getRequestId()); blockManager.addSequence(seq3, llmRequest3->getNumTokens(0), numContextBlocks3, *llmRequest3, maxAttentionWindow); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 4); blockManager.releaseBlocks(seq3, llmRequest3); blockManager.releaseSequence(seq3.getRequestId()); 
EXPECT_EQ(blockManager.getNumFreeBlocks(), 4); // Check that request 0 blocks weren't overwritten auto inputTokens4 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7}); auto const inputLength4 = static_cast(inputTokens4->size()); auto llmRequest4 = std::make_shared(4, maxNewTokens, inputTokens4, samplingConfig, isStreaming); GenerationRequest seq4{4, inputLength3, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks4 = tc::ceilDiv(inputLength4, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq4.getRequestId()); blockManager.addSequence(seq4, llmRequest4->getNumTokens(0), numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), 4); // Check that request 2 block 0 has been evicted auto inputTokens5 = std::make_shared(VecTokens{16, 17, 18, 19, 20, 21, 22, 23}); auto const inputLength5 = static_cast(inputTokens5->size()); auto llmRequest5 = std::make_shared(5, maxNewTokens, inputTokens5, samplingConfig, isStreaming); GenerationRequest seq5{5, inputLength5, beamWidth, blockManager.getWindowSizesMetadata()}; auto numContextBlocks5 = tc::ceilDiv(inputLength5, blockManager.getTokensPerBlock()); blockManager.holdSequence(seq5.getRequestId()); blockManager.addSequence(seq5, llmRequest5->getNumTokens(0), numContextBlocks5, *llmRequest5, maxAttentionWindow); EXPECT_EQ(llmRequest5->getContextCurrentPosition(), 0); } TEST_F(KVCacheManagerTest, KVCacheManagerDecodeBlockPriorityTest) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 8; auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 8; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig 
const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, maxSequenceLength, true, onboardBlocks); kvCacheManager.allocatePools(false); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 8); // Uses 3 blocks 0, 1, 2 which contain [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11] auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); llmRequest0->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 90)}, 90)); kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); // 5 blocks available. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 5); // Add a token to request 0, which occupies a new block 3. kvCacheManager.addToken(0); llmRequest0->addNewToken(0, 0); // block 3 contains [0] // 4 blocks left. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 4); // uses up 3 more blocks 4, 5, 6. 
[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23] auto inputTokens1 = std::make_shared(VecTokens{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); llmRequest1->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 5)}, 5)); kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); // one block left. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 1); // add another token, which occupies another new block kvCacheManager.addToken(1); llmRequest1->addNewToken(0, 0); // block 7 contains [0] // no block available. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 0); // remove both sequences, blocks get stored // leaf block 3 (priority 90), context blocks 2, 1, 0 (priority 5) (void) kvCacheManager.removeSequence(0, llmRequest0); // leaf block 7 (priority 5), context blocks 6, 5, 4 (priority 90) (void) kvCacheManager.removeSequence(1, llmRequest1); // all blocks are available again. EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 8); // no reuse, blocks are evicted by new request: // evict block 7 (lowest priority, first released block) // evict block 6 (lowest priority, second released block) // evict block 5 (lowest priority, third released block) // uses up 3 blocks 7, 6, 5. 
[24, 25, 26, 27], [28, 29, 30, 31], [32, 33, 34, 35] auto inputTokens2 = std::make_shared(VecTokens{24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); // leaf block 2 (priority 35), context blocks 3, 7 (priority 35) (void) kvCacheManager.removeSequence(2, llmRequest2); // Uses 3 blocks 0, 1, 2 which contain [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11] auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); // Reuse block 0, 1, and partial reuse block 2. (maximum reuse is inputLength - 1) // Two blocks reused, the third block partially reused. 
EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 11);
}

TEST_F(KVCacheManagerTest, KVCacheManagerTimedEvictionTest)
{
    using namespace tensorrt_llm::executor;
    using namespace std::chrono_literals;

    auto constexpr numLayers = 12;
    auto constexpr numHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto constexpr maxNumSequences = 8;
    auto constexpr blocksInPrimaryPool = 8;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr onboardBlocks = true;
    // NOTE(review): stripped template arguments (`<tr::CudaStream>`, `<SizeType32>`, `<VecTokens>`,
    // `<LlmRequest>`) restored throughout this test.
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr beamWidth = 1;

    SizeType32 constexpr maxNewTokens = 8;
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};

    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};

    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        beamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream,
        maxSequenceLength, true, onboardBlocks);
    kvCacheManager.allocatePools(false);

    // Blocks for request 0 retain priority 80 for only 10ms after release.
    auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
    auto const inputLength0 = static_cast<SizeType32>(inputTokens0->size());
    auto llmRequest0 = std::make_shared<LlmRequest>(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming);
    llmRequest0->setKvCacheRetentionConfig(
        KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 80, 10ms)}, 80));
    kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0);
    (void) kvCacheManager.removeSequence(0, llmRequest0);

    // Request 1 keeps priority 50 with no expiry.
    auto inputTokens1 = std::make_shared<VecTokens>(VecTokens{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
    auto const inputLength1 = static_cast<SizeType32>(inputTokens1->size());
    auto llmRequest1 = std::make_shared<LlmRequest>(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming);
    llmRequest1->setKvCacheRetentionConfig(
        KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 50)}, 80));
    kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1);
    (void) kvCacheManager.removeSequence(1, llmRequest1);

    // Sleep past request 0's 10ms retention window so its priority decays.
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
    // Manually trigger a refresh.
    kvCacheManager.refreshBlocks();

    // Clear out some of the blocks.
    auto inputTokens2 = std::make_shared<VecTokens>(VecTokens{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
    auto const inputLength2 = static_cast<SizeType32>(inputTokens2->size());
    auto llmRequest2 = std::make_shared<LlmRequest>(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming);
    kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2);
    (void) kvCacheManager.removeSequence(2, llmRequest2);

    // Check that the [12, 13, 14, 15] block is still in the cache
    auto inputTokens3 = std::make_shared<VecTokens>(VecTokens{12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
    auto const inputLength3 = static_cast<SizeType32>(inputTokens3->size());
    auto llmRequest3 = std::make_shared<LlmRequest>(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming);
    kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3);
    EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 11);
}

TEST_F(KVCacheManagerTest, KVCacheManagerDecodeTimedEvictionTest)
{
    using namespace tensorrt_llm::executor;
    using namespace std::chrono_literals;

    auto constexpr numLayers = 12;
    auto constexpr numHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto constexpr maxNumSequences = 8;
    auto constexpr blocksInPrimaryPool = 9;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr onboardBlocks = true;
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr beamWidth = 1;

    SizeType32 constexpr maxNewTokens = 8;
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto
const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, maxSequenceLength, true, onboardBlocks); kvCacheManager.allocatePools(false); { // 12 tokens, occupy 3 blocks 0, 1, 2. // [1, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11] auto inputTokens0 = std::make_shared(VecTokens{1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); llmRequest0->setKvCacheRetentionConfig( KvCacheRetentionConfig({KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 50)}, 50)); // Set all blocks to priority 50. kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); kvCacheManager.storeContextBlocks(*llmRequest0); // Occupy a new block, block 3, adding 3 tokens to block 3. // [1, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [0, 0, 0] for (int i = 0; i < 3; i++) { kvCacheManager.addToken(0); llmRequest0->addNewToken(0, 0); } (void) kvCacheManager.removeSequence(0, llmRequest0); } { // 12 tokens, occupy 3 blocks 4, 5, 6. // [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11] auto inputTokens1 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); llmRequest1->setKvCacheRetentionConfig(KvCacheRetentionConfig( {}, KvCacheRetentionConfig::kMaxRetentionPriority, 20ms)); // Set decode blocks to max priority for 20ms. 
kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); kvCacheManager.storeContextBlocks(*llmRequest1); // Occupy a new block, block 3, adding 3 tokens to block 3. // [1, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [0, 0, 0] for (int i = 0; i < 3; i++) { kvCacheManager.addToken(1); llmRequest1->addNewToken(0, 0); } (void) kvCacheManager.removeSequence(1, llmRequest1); } std::this_thread::sleep_for(std::chrono::milliseconds(50)); kvCacheManager.refreshBlocks(); // 8 tokens, occupying blocks 8, 6 auto inputTokens2 = std::make_shared(VecTokens{0, 0, 0, 0, 0, 0, 0, 0}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); (void) kvCacheManager.removeSequence(2, llmRequest2); // 12 tokens, reusing block 4, 5. Block 6 is overwritten so no reuse. auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 8); } TEST_F(KVCacheManagerTest, KVCacheManagerSecondaryBlockPrimaryChildTest) { // It's possible for a block in secondary memory to have a primary child. Make sure this case is handled. 
auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxNumSequences = 8; auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 4; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, maxSequenceLength, true, onboardBlocks); kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); // 12 tokens, get block 0, 1, 2 // [0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11] kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); (void) kvCacheManager.removeSequence(0, llmRequest0); // store blocks 0, 1, 2 for reuse ([0,1,2,3], [4,5,6,7], [8,9,10]) // Offload the last two blocks of llmRequest0 to secondary memory auto inputTokens1 = std::make_shared(VecTokens{1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); // Uses blocks 3, 4, 5, block 2 and 1 to be offloaded to secondary // Block 4 is now in primary (replacing 2) // Block 5 is 
now in primary (replacing 1) kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); (void) kvCacheManager.removeSequence(1, llmRequest1); // store blocks 3, 4, 5 for reuse ([1,1,2,3], [4,5,6,7], [8,9,10]) // Match the middle block of request 0 // Uses block 6, block 0 is offloaded to secondary // Block 6 copies content from block 0 to itselg. auto inputTokens2 = std::make_shared(VecTokens{0, 1, 2, 3}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); // reuse block 0 kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); EXPECT_EQ(llmRequest2->getContextCurrentPosition(), 3); kvCacheManager.storeContextBlocks(*llmRequest2); // Add a decode block that matches the contents of seq 0 block 1, add a unique decode block // The 4 tokens added has the same content as block 1. for (int token = 4; token < 8; token++) { llmRequest2->addNewToken(token, 0); kvCacheManager.addToken(2); } // Add 2 more tokens, occupying another block llmRequest2->addNewToken(0, 0); kvCacheManager.addToken(2); llmRequest2->addNewToken(0, 0); kvCacheManager.addToken(2); // The middle block remains in secondary, but the third block is in primary // FIXME: When removing the sequence, we should observe whether released // blocks can replace itself as the block reused in the search tree if // the matching block is currently in secondary memory. We can release the // block in secondary if so. // If we do this, then the context current position at the bottom of this // unit test will be 9 because then the block content [4,5,6,7] can be // found and reused. (void) kvCacheManager.removeSequence(2, llmRequest2); // 10 tokens, reusing the block 0 only because when we want to acquire // the second block, contents of block 3 will be offloaded to block 1. 
auto inputTokens3 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 0, 0}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); // Check out FIXME note above. If addressed, this should be 9. EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 4); } TEST_F(KVCacheManagerTest, KVCacheManagerLeafBlockTest) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 0; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, maxSequenceLength, true, onboardBlocks); kvCacheManager.allocatePools(false); auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); kvCacheManager.addSequence(0, inputLength0, beamWidth, llmRequest0); kvCacheManager.storeContextBlocks(*llmRequest0); llmRequest0->addNewToken(0, 0); kvCacheManager.addToken(0); // The second block allocated should be first in line for 
eviction. (void) kvCacheManager.removeSequence(0, llmRequest0); auto inputTokens1 = std::make_shared(VecTokens{1, 1, 2, 3}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); kvCacheManager.addSequence(1, inputLength1, beamWidth, llmRequest1); GenerationRequest const& seq1 = kvCacheManager.getSequence(1); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 0); // Block 1 should NOT be reused. It was not freed even if partial. EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(0), ::testing::ElementsAreArray({2})); // Allocate the remaining 3 blocks in primary auto inputTokens2 = std::make_shared(VecTokens{2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength2 = static_cast(inputTokens2->size()); auto llmRequest2 = std::make_shared(2, maxNewTokens, inputTokens2, samplingConfig, isStreaming); kvCacheManager.addSequence(2, inputLength2, beamWidth, llmRequest2); (void) kvCacheManager.removeSequence(1, llmRequest1); (void) kvCacheManager.removeSequence(2, llmRequest2); auto inputTokens3 = std::make_shared(VecTokens{2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); auto const inputLength3 = static_cast(inputTokens3->size()); auto llmRequest3 = std::make_shared(3, maxNewTokens, inputTokens3, samplingConfig, isStreaming); kvCacheManager.addSequence(3, inputLength3, beamWidth, llmRequest3); EXPECT_EQ(llmRequest3->getContextCurrentPosition(), 11); EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 1); kvCacheManager.addToken(3); llmRequest3->addNewToken(0, 0); EXPECT_EQ(kvCacheManager.getNumFreeBlocks(), 0); auto inputTokens4 = std::make_shared(VecTokens{42}); auto const inputLength4 = static_cast(inputTokens4->size()); auto llmRequest4 = std::make_shared(4, maxNewTokens, inputTokens4, samplingConfig, isStreaming); EXPECT_THROW(kvCacheManager.addSequence(4, inputLength4, beamWidth, llmRequest4), std::exception); } TEST_F(KVCacheManagerTest, 
KVCacheManagerLeafBlockWithDependentTest) { auto constexpr numLayers = 12; auto constexpr numHeads = 6; auto constexpr sizePerHead = 16; auto constexpr tokensPerBlock = 4; auto constexpr maxBlocksPerSeq = 4; auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxNumSequences = 8; auto constexpr blocksInPrimaryPool = 4; auto constexpr blocksInSecondaryPool = 1; auto constexpr onboardBlocks = true; auto const stream = std::make_shared(); auto constexpr beamWidth = 1; auto constexpr beamIdx = 0; SizeType32 constexpr maxNewTokens = 8; tr::SamplingConfig const samplingConfig{beamWidth}; bool constexpr isStreaming{false}; auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq; auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, 0, stream, maxSequenceLength, true, onboardBlocks); kvCacheManager.allocatePools(false); // Create sequence with one block worth of context tokens int requestId0 = 0; auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3}); auto const inputLength0 = static_cast(inputTokens0->size()); auto llmRequest0 = std::make_shared(requestId0, maxNewTokens, inputTokens0, samplingConfig, isStreaming); kvCacheManager.addSequence(requestId0, inputLength0, beamWidth, llmRequest0); kvCacheManager.storeContextBlocks(*llmRequest0); GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId0); // Add two more blocks of generated tokens for (int i = tokensPerBlock; i < 3 * tokensPerBlock; ++i) { llmRequest0->addNewToken(i, beamIdx); kvCacheManager.addToken(requestId0); } // Verify auto cacheBlockIds0 = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds0, ::testing::ElementsAreArray({0, 1, 2})); // Raise priority of middle block 
to prevent offloading auto const& blockManager = kvCacheManager.getBlockManager(); auto middleBlock = blockManager.getBlockById(cacheBlockIds0[1], maxAttentionWindow); middleBlock->setPriority(75); // Create another sequence with one block worth of context tokens (no reuse). // 4 tokens, occupying block 3 int requestId1 = 1; auto inputTokens1 = std::make_shared(VecTokens{100, 101, 102, 103}); auto const inputLength1 = static_cast(inputTokens1->size()); auto llmRequest1 = std::make_shared(requestId1, maxNewTokens, inputTokens1, samplingConfig, isStreaming); kvCacheManager.addSequence(requestId1, inputLength1, beamWidth, llmRequest1); kvCacheManager.storeContextBlocks(*llmRequest1); GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId1); // Verify that all primary blocks are in use EXPECT_EQ(blockManager.getNumFreeBlocks(), 0); // Free first sequence (void) kvCacheManager.removeSequence(requestId0, llmRequest0); // Verify that 3 primary blocks are free. // Since block 1 has higher priority, block 2 and 0 will be used first. EXPECT_EQ(blockManager.getNumFreeBlocks(), 3); // Write one generated token to second sequence. This will prompt block 2 to be offloaded. // Block 4 will be in primary (replacing block 2) llmRequest1->addNewToken(104, beamIdx); kvCacheManager.addToken(requestId1); // Verify that block 2 has block 1 as parent auto block2 = blockManager.getBlockById(2, maxAttentionWindow); EXPECT_TRUE(block2->getPrevBlock() != nullptr); EXPECT_EQ(block2->getPrevBlock()->getBlockId(), 1); EXPECT_FALSE(block2->isPrimary()); // Fill block for (int i = 101 + tokensPerBlock; i < 100 + 2 * tokensPerBlock; ++i) { llmRequest1->addNewToken(i, beamIdx); kvCacheManager.addToken(requestId1); } // Verify auto cacheBlockIds1 = seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx); EXPECT_THAT(cacheBlockIds1, ::testing::ElementsAreArray({3, 4})); // Write one generated token to second sequence. This will prompt block 0 to be offloaded, // replacing block 2. 
llmRequest1->addNewToken(100 + 2 * tokensPerBlock, beamIdx); kvCacheManager.addToken(requestId1); // Verify that block 2 is free, has no parent EXPECT_EQ(block2->getPrevBlock(), nullptr); // Verify that it is block 0 that is in secondary auto block0 = blockManager.getBlockById(0, maxAttentionWindow); EXPECT_FALSE(block0->isPrimary()); // Cleanup (void) kvCacheManager.removeSequence(requestId1, llmRequest1); } TEST_P(KVCacheManagerTest, DISABLED_KVCacheManagerAllocationTest) { using DType = half; auto constexpr numLayers = 2; auto constexpr numHeads = 2; auto constexpr sizePerHead = 64; auto constexpr tokensPerBlock = 64; auto constexpr maxBlocksPerSeq = 10; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 4; auto constexpr sinkTokenLength = 0; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const stream = std::make_shared(); auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxSequenceLength; auto constexpr inputLength = maxSequenceLength - tokensPerBlock - 1; auto constexpr numSharedBlocks = inputLength / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = false; auto constexpr useUvm = false; auto constexpr onboardBlocks = true; auto const homogeneousLayers = GetParam(); auto const granularity = tensorrt_llm::common::getAllocationGranularity(); auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {totalNumBlocks, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager = homogeneousLayers ? 
KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks) : KVCacheManager(std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks); auto const& blockManager = kvCacheManager.getBlockManager(); auto const& bufferManager = blockManager.getBufferManager(theOnlyWindowSize(kvCacheManager)); auto const memoryPoolUsedBefore = bufferManager.memoryPoolUsed(); kvCacheManager.allocatePools(useUvm); auto const memoryPoolUsedAfter = bufferManager.memoryPoolUsed(); EXPECT_GT(memoryPoolUsedAfter, memoryPoolUsedBefore); auto const actualPoolMemoryUsageDiff = static_cast(memoryPoolUsedAfter - memoryPoolUsedBefore); auto const expectedPoolMemoryUsageDiff = tensorrt_llm::common::roundUp( sizeof(DType) * static_cast(totalNumBlocks) * numLayers * 2 * numHeads * tokensPerBlock * sizePerHead, static_cast(granularity)); EXPECT_EQ(actualPoolMemoryUsageDiff, expectedPoolMemoryUsageDiff); } TEST_P(KVCacheManagerTest, KVCacheManagerTest) { // Full attention using DType = half; using SizeType32 = KVCacheManager::SizeType32; auto constexpr numLayers = 4; auto constexpr numHeads = 2; // heterogeneous layers std::vector const numHeadsPerLayer({3, 2, 1, 2}); std::map const expectedHeadsPerPool({{0, 1}, {1, 2}, {2, 3}}); std::map const expectedLayersPerPool({{0, 1}, {1, 2}, {2, 1}}); auto constexpr sizePerHead = 64; auto constexpr hiddenSize = numHeads * sizePerHead; auto constexpr tokensPerBlock = 64; auto constexpr maxBlocksPerSeq = 10; auto constexpr maxNumSequences = 8; auto constexpr maxBeamWidth = 4; auto constexpr sinkTokenLength = 0; auto const stream = std::make_shared(); auto 
constexpr requestId = 7; auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq; auto constexpr maxAttentionWindow = maxSequenceLength; auto constexpr inputLength = maxSequenceLength - tokensPerBlock - 1; auto constexpr numSharedBlocks = inputLength / tokensPerBlock; auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth; auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq; auto constexpr blocksInSecondaryPool = 0; auto constexpr enableBlockReuse = false; auto constexpr onboardBlocks = true; auto const homogeneousLayers = GetParam(); auto const expectedNumPools = homogeneousLayers ? 1 : static_cast(expectedHeadsPerPool.size()); auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {totalNumBlocks, blocksInSecondaryPool}}}; KVCacheManager kvCacheManager = homogeneousLayers ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks) : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks); kvCacheManager.allocatePools(false); EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq); EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock); EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks); auto const& blockManager = kvCacheManager.getBlockManager(); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth); tr::ITensor::SharedPtr kvCacheBlockOffsets = 
tr::BufferManager::cpu( tr::ITensor::makeShape({expectedNumPools, maxNumSequences * maxBeamWidth, 2, maxBlocksPerSeq}), tr::TRTDataType::value); auto kvCacheBlockOffsetsRange = tensorrt_llm::runtime::BufferRange(*kvCacheBlockOffsets); std::fill(kvCacheBlockOffsetsRange.begin(), kvCacheBlockOffsetsRange.end(), tk::KVCacheIndex{std::numeric_limits::max()}); kvCacheManager.copyBlockOffsets(*kvCacheBlockOffsets, 0, requestId); EXPECT_EQ(blockManager.getNumPools(), expectedNumPools); if (homogeneousLayers) { EXPECT_EQ(blockManager.getBlockSize(0), numHeads * sizePerHead * tokensPerBlock); } else { for (auto layerIdx = 0; layerIdx < numLayers; layerIdx++) { EXPECT_EQ(blockManager.getBlockSize(blockManager.getLayerPoolIdx(layerIdx)), numHeadsPerLayer.at(layerIdx) * sizePerHead * tokensPerBlock); } } { for (auto poolIdx = 0; poolIdx < blockManager.getNumPools(); poolIdx++) { auto const numLayersInPool = homogeneousLayers ? numLayers : expectedLayersPerPool.at(poolIdx); auto const offsetBetweenBlocks = numLayersInPool * 2; tr::ITensor::SharedPtr blockOffsetsSlice = tr::ITensor::slice(tr::ITensor::at(kvCacheBlockOffsets, {poolIdx}), 0, maxBeamWidth); auto blockOffsetsShape = blockOffsetsSlice->getShape(); auto* const blockOffsetsPtr = tr::bufferCast(*blockOffsetsSlice); tk::KVCacheIndex::UnderlyingType runningSum{0}; for (auto block = 0; block < numSharedBlocks; ++block) { for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; } runningSum += offsetBetweenBlocks; } { auto const block = numSharedBlocks; for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const 
kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block; runningSum += offsetBetweenBlocks; } } { auto const block = numSharedBlocks + 1; auto const expected = std::numeric_limits::max(); for (auto beam = 0; beam < maxBeamWidth; ++beam) { auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block); auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block); auto const kOffset = blockOffsetsPtr[kOffsetIdx]; auto const vOffset = blockOffsetsPtr[vOffsetIdx]; EXPECT_EQ(kOffset.get(), expected) << "beam:" << beam << " block:" << block; ASSERT_EQ(vOffset.get(), expected) << "beam:" << beam << " block:" << block; } } } } EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth * 2); EXPECT_NO_THROW((void) kvCacheManager.removeSequence(requestId)); EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks); auto currentNumBlocks = totalNumBlocks; for (auto requestId = 0; requestId < maxNumSequences; ++requestId) { EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth)); currentNumBlocks -= numSharedBlocks + maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); EXPECT_NO_THROW(kvCacheManager.addToken(requestId)); currentNumBlocks -= maxBeamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks); } EXPECT_EQ(blockManager.getNumFreeBlocks(), 0); } 
TEST_P(KVCacheManagerTest, KVCacheManagerRewindTokensTest)
{
    // Verify that rewindKVCache releases exactly the blocks that the rewound tokens occupied.
    using DType = half;

    auto constexpr numLayers = 2;
    auto constexpr numHeads = 2;
    auto constexpr sizePerHead = 64;
    auto constexpr hiddenSize = numHeads * sizePerHead;
    auto constexpr tokensPerBlock = 64;
    auto constexpr maxBlocksPerSeq = 10;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxBeamWidth = 1;
    auto constexpr sinkTokenLength = 0;
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr requestId = 7;
    auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto constexpr maxAttentionWindow = maxSequenceLength;
    auto constexpr inputLength = maxSequenceLength - tokensPerBlock - 1;
    auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
    auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
    auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr enableBlockReuse = false;
    auto constexpr onboardBlocks = true;
    auto const homogeneousLayers = GetParam();

    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {totalNumBlocks, blocksInSecondaryPool}}};
    KVCacheManager kvCacheManager = homogeneousLayers
        ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
            maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
            sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks)
        : KVCacheManager(std::vector<SizeType32>(numLayers, numHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
            maxNumSequences, maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt,
            nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks);
    kvCacheManager.allocatePools(false);

    EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
    EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);

    auto const& blockManager = kvCacheManager.getBlockManager();
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks);
    EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth);
    // First added token still fits into the last context block.
    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth);
    // Second added token opens a fresh block.
    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numBlocksPerSeq);
    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numBlocksPerSeq);
    // Rewinding 4 tokens returns the extra block to the free pool.
    EXPECT_NO_THROW(kvCacheManager.rewindKVCache(requestId, 4));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth);
    EXPECT_NO_THROW((void) kvCacheManager.removeSequence(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks);

    auto currentNumBlocks = totalNumBlocks;
    for (auto requestId = 0; requestId < maxNumSequences; ++requestId)
    {
        EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth));
        currentNumBlocks -= numSharedBlocks + maxBeamWidth;
        EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks);
        EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
        EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
        currentNumBlocks -= maxBeamWidth;
        EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks);
        EXPECT_NO_THROW(kvCacheManager.rewindKVCache(requestId, 2));
        currentNumBlocks += maxBeamWidth;
        EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks);
    }
}

TEST_P(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowTest)
{
    // Sliding window attention: the attention window is smaller than the sequence,
    // so tokens past the window must not consume additional blocks.
    using DType = half;
    using SizeType32 = KVCacheManager::SizeType32;

    auto constexpr numLayers = 4;
    auto constexpr numHeads = 2;
    // heterogeneous layers
    std::vector<SizeType32> const numHeadsPerLayer({3, 2, 1, 2});
    std::map<SizeType32, SizeType32> const expectedHeadsPerPool({{0, 1}, {1, 2}, {2, 3}});
    std::map<SizeType32, SizeType32> const expectedLayersPerPool({{0, 1}, {1, 2}, {2, 1}});
    auto constexpr sizePerHead = 64;
    auto constexpr tokensPerBlock = 64;
    auto constexpr blockLengthPerSeq = 10;
    auto constexpr maxNumSequences = 8;
    // TODO: Support and add coverage for beamWidth > 1
    auto constexpr maxBeamWidth = 1;
    auto constexpr sinkTokenLength = 0;
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr requestId = 7;
    auto constexpr maxSequenceLength = tokensPerBlock * blockLengthPerSeq;
    auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxSequenceLength, tokensPerBlock);
    auto constexpr inputLength = maxSequenceLength - tokensPerBlock - 1;
    auto constexpr maxAttentionWindow = inputLength; // sliding window attention
    auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
    auto constexpr numBlocksPerSeq = numSharedBlocks + (blockLengthPerSeq - numSharedBlocks) * maxBeamWidth;
    auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr enableBlockReuse = false;
    auto constexpr onboardBlocks = true;
    auto const homogeneousLayers = GetParam();
    auto const expectedNumPools = homogeneousLayers ? 1 : static_cast<SizeType32>(expectedHeadsPerPool.size());

    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {totalNumBlocks, blocksInSecondaryPool}}};
    KVCacheManager kvCacheManager = homogeneousLayers
        ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
            maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
            sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks)
        : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, maxBeamWidth,
            std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength,
            stream, maxSequenceLength, enableBlockReuse, onboardBlocks);
    kvCacheManager.allocatePools(false);

    EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq);
    EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
    EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);

    auto const& blockManager = kvCacheManager.getBlockManager();
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks);
    EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth);

    tr::ITensor::SharedPtr const kvCacheBlockOffsets = tr::BufferManager::cpu(
        tr::ITensor::makeShape({expectedNumPools, maxNumSequences * maxBeamWidth, 2, maxBlocksPerSeq}),
        tr::TRTDataType<tk::KVCacheIndex>::value);
    kvCacheManager.copyBlockOffsets(*kvCacheBlockOffsets, 0, requestId);

    EXPECT_EQ(blockManager.getNumPools(), expectedNumPools);
    if (homogeneousLayers)
    {
        EXPECT_EQ(blockManager.getBlockSize(0), numHeads * sizePerHead * tokensPerBlock);
    }
    else
    {
        for (auto layerIdx = 0; layerIdx < numLayers; layerIdx++)
        {
            EXPECT_EQ(blockManager.getBlockSize(blockManager.getLayerPoolIdx(layerIdx)),
                numHeadsPerLayer.at(layerIdx) * sizePerHead * tokensPerBlock);
        }
    }

    for (auto poolIdx = 0; poolIdx < blockManager.getNumPools(); poolIdx++)
    {
        auto const numLayersInPool = homogeneousLayers ? numLayers : expectedLayersPerPool.at(poolIdx);
        auto const offsetBetweenBlocks = numLayersInPool * 2;
        tr::ITensor::SharedPtr const blockOffsetsSlice
            = tr::ITensor::slice(tr::ITensor::at(kvCacheBlockOffsets, {poolIdx}), 0, maxBeamWidth);
        auto blockOffsetsShape = blockOffsetsSlice->getShape();
        auto* const blockOffsetsPtr = tr::bufferCast<tk::KVCacheIndex>(*blockOffsetsSlice);

        tk::KVCacheIndex::UnderlyingType runningSum{0};
        // Shared (context) blocks: all beams point at the same block.
        for (auto block = 0; block < numSharedBlocks; ++block)
        {
            for (auto beam = 0; beam < maxBeamWidth; ++beam)
            {
                auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block);
                auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block);
                auto const kOffset = blockOffsetsPtr[kOffsetIdx];
                auto const vOffset = blockOffsetsPtr[vOffsetIdx];
                EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block;
                ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block;
            }
            runningSum += offsetBetweenBlocks;
        }
        {
            // First unshared block: each beam gets its own block.
            auto const block = numSharedBlocks;
            for (auto beam = 0; beam < maxBeamWidth; ++beam)
            {
                auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block);
                auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block);
                auto const kOffset = blockOffsetsPtr[kOffsetIdx];
                auto const vOffset = blockOffsetsPtr[vOffsetIdx];
                EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block;
                ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block;
                runningSum += offsetBetweenBlocks;
            }
        }
    }

    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth);
    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocks - maxBeamWidth * 2);
    EXPECT_NO_THROW((void) kvCacheManager.removeSequence(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks);

    auto currentNumBlocks = totalNumBlocks;
    for (auto requestId = 0; requestId < maxNumSequences; ++requestId)
    {
        EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth));
        currentNumBlocks -= numSharedBlocks + maxBeamWidth;
        EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks);
        // Tokens beyond the attention window do not allocate new blocks.
        EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
        EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks);
    }
    EXPECT_EQ(blockManager.getNumFreeBlocks(), maxNumSequences);
}

TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowSmallerThanBlockSizeTest)
{
    auto constexpr numLayers = 2;
    auto constexpr numHeads = 2;
    auto constexpr sizePerHead = 64;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxBeamWidth = 1;
    auto constexpr sinkTokenLength = 0;
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr maxSequenceLength = 128;
    // Enable sliding window kv cache for long input tokens.
    auto constexpr maxAttentionWindow = 3;
    auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxSequenceLength, tokensPerBlock);
    auto constexpr blocksInPrimaryPool = 16;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr enableBlockReuse = false;
    auto constexpr onboardBlocks = true;

    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};
    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        maxBeamWidth, std::vector<SizeType32>{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
        sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks);
    kvCacheManager.allocatePools(false);

    auto const& blockManager = kvCacheManager.getBlockManager();
    auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager);
    SizeType32 constexpr maxNewTokens = 40;
    // prepare tokens with token[i] = 1000 + i
    TokenIdType constexpr firstToken = 1000;
    auto constexpr beamWidth = maxBeamWidth;
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool
// NOTE(review): template-argument lists ("<...>") were stripped from this chunk
// by extraction (std::make_shared(), std::holds_alternative(...), std::get(...));
// tokens are kept exactly as recovered -- restore them from the upstream file
// before compiling. Only comments/formatting are added here.
constexpr isStreaming{false}; // completes "bool constexpr isStreaming" split across chunk lines

    ///////////////////////////////////////////////////////////////////////////
    // add a request that starts shorter and gets longer than the max attention window and then remove it
    SizeType32 requestId = 0;
    int inputLength = 2;
    auto inputTokens = std::make_shared(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    auto llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    auto constexpr beamIdx = 0;
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
    EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0}));

    // add tokens, reaching max attention window
    llmRequest->addNewToken(1002, beamIdx);
    kvCacheManager.addToken(requestId);
    auto numBlocks = seq0.getCacheBlockIds(onlyWindowSize)[beamIdx].size();
    EXPECT_EQ(numBlocks, 1);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0}));

    // add new tokens exceeding max attention window, but not enough to allocate another block
    llmRequest->addNewToken(1003, beamIdx);
    kvCacheManager.addToken(requestId);
    numBlocks = seq0.getCacheBlockIds(onlyWindowSize)[beamIdx].size();
    EXPECT_EQ(numBlocks, 1);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0}));

    // add more new tokens, enough to allocate a new block but not enough to detach block
    llmRequest->addNewToken(1004, beamIdx);
    kvCacheManager.addToken(requestId);
    numBlocks = seq0.getCacheBlockIds(onlyWindowSize)[beamIdx].size();
    EXPECT_EQ(numBlocks, 2);
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
    EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1}));

    // add more new tokens, enough to detach block without allocating a new one
    llmRequest->addNewToken(1005, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1006, beamIdx);
    kvCacheManager.addToken(requestId);
    numBlocks = seq0.getCacheBlockIds(onlyWindowSize)[beamIdx].size();
    EXPECT_EQ(numBlocks, 2);
    EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1}));

    // add more new tokens, to allocate a new block
    llmRequest->addNewToken(1007, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1008, beamIdx);
    kvCacheManager.addToken(requestId);
    numBlocks = seq0.getCacheBlockIds(onlyWindowSize)[beamIdx].size();
    EXPECT_EQ(numBlocks, 3);
    EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));

    EXPECT_NO_THROW((void) kvCacheManager.removeSequence(requestId, llmRequest));
    // no blocks stored because reuse is disabled
    EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
}

// Verifies the KV-cache event stream: an event on pool allocation, store events
// for context/decode blocks, update events on offload/onboard between cache
// levels, and a removal event on eviction from the reuse tree.
TEST_F(KVCacheManagerTest, KVCacheManagerEventStream)
{
    auto constexpr numLayers = 12;
    auto constexpr numHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr blocksInPrimaryPool = 8;
    auto constexpr blocksInSecondaryPool = 2;
    auto constexpr onboardBlocks = true;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared();
    auto constexpr beamWidth = 1;
    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto const maxAttentionWindow = maxSequenceLength;
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};

    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        beamWidth, std::vector{maxAttentionWindow}, std::nullopt, dtype, 0, stream, maxSequenceLength, true,
        onboardBlocks, CacheType::kSELF, std::nullopt, std::make_unique(1024));
    kvCacheManager.allocatePools(false);

    // Pool allocation emits one "created" event carrying per-level block counts.
    auto events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 1);
    EXPECT_TRUE(std::holds_alternative(events.front().data));
    EXPECT_THAT(std::get(events.front().data).numBlocksPerCacheLevel, ::testing::ElementsAreArray({8, 2}));
    EXPECT_EQ(getEvents(kvCacheManager).size(), 0);

    auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
    auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true);
    llmRequest0->setLoraTaskId(42);
    kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0);
    kvCacheManager.storeContextBlocks(*llmRequest0);

    // Storing context blocks emits one store event for the two full blocks,
    // stamped with the request's LoRA task id and primary cache level.
    events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 1);
    EXPECT_TRUE(std::holds_alternative(events.front().data));
    EXPECT_EQ(std::get(events.front().data).parentHash, std::nullopt);
    EXPECT_EQ(std::get(events.front().data).blocks.size(), 2);
    EXPECT_EQ(std::get(events.front().data).blocks[0].loraId, 42);
    EXPECT_EQ(std::get(events.front().data).blocks[0].cacheLevel, 0);

    kvCacheManager.addToken(0);
    llmRequest0->addNewToken(0, 0);
    (void) kvCacheManager.removeSequence(0, llmRequest0);

    auto newEvents = getEvents(kvCacheManager);
    EXPECT_EQ(newEvents.size(), 1);
    EXPECT_EQ(newEvents.front().eventId, events.front().eventId + 1);
    EXPECT_TRUE(std::holds_alternative(newEvents.front().data));
    // Check that it provides the link to the parent, and that it only stores the one block.
    auto storedEventData = std::get(newEvents.front().data);
    EXPECT_EQ(storedEventData.parentHash, std::get(events.front().data).blocks.back().blockHash);
    EXPECT_EQ(storedEventData.blocks.size(), 1);

    // Store with the same tokens
    auto inputTokens1 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8});
    auto llmRequest1 = std::make_shared(1, 0, inputTokens1, samplingConfig, true);
    llmRequest1->setLoraTaskId(42);
    kvCacheManager.addSequence(1, inputTokens1->size(), beamWidth, llmRequest1);
    kvCacheManager.storeContextBlocks(*llmRequest1);
    (void) kvCacheManager.removeSequence(1, llmRequest1);
    // Fully reused -- nothing new stored, so no events.
    events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 0);

    auto inputTokens2 = std::make_shared(VecTokens{1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    auto llmRequest2 = std::make_shared(2, 0, inputTokens2, samplingConfig, true);
    kvCacheManager.addSequence(2, inputTokens2->size(), beamWidth, llmRequest2);
    auto inputTokens3 = std::make_shared(VecTokens{2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
    auto llmRequest3 = std::make_shared(3, 0, inputTokens3, samplingConfig, true);
    kvCacheManager.addSequence(3, inputTokens3->size(), beamWidth, llmRequest3);

    events = getEvents(kvCacheManager);
    size_t firstSwapped = std::get(events.front().data).blockHash;
    // 3 blocks swapped to secondary
    EXPECT_EQ(events.size(), 4);
    for (int i = 2; i >= 0; i--)
    {
        EXPECT_TRUE(std::holds_alternative(events.front().data));
        EXPECT_EQ(std::get(events.front().data).cacheLevel->oldValue, 0);
        EXPECT_EQ(std::get(events.front().data).cacheLevel->newValue, 1);
        events.pop_front();
    }
    // The first block swapped to secondary gets evicted.
    EXPECT_TRUE(std::holds_alternative(events.front().data));
    EXPECT_THAT(std::get(events.front().data).blockHashes, ::testing::ElementsAreArray({firstSwapped}));

    (void) kvCacheManager.removeSequence(2, llmRequest2);
    (void) kvCacheManager.removeSequence(3, llmRequest3);
    events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 2);
    for (int i = 0; i < 2; i++)
    {
        EXPECT_TRUE(std::holds_alternative(events.front().data));
        EXPECT_EQ(std::get(events.front().data).parentHash, std::nullopt);
        EXPECT_EQ(std::get(events.front().data).blocks.size(), 3);
        events.pop_front();
    }

    auto inputTokens4 = std::make_shared(VecTokens{0, 1, 2, 3, 1, 1, 1, 1, 0});
    auto llmRequest4 = std::make_shared(4, 0, inputTokens4, samplingConfig, true);
    llmRequest4->setLoraTaskId(42);
    kvCacheManager.addSequence(4, inputTokens4->size(), beamWidth, llmRequest4);
    events = getEvents(kvCacheManager);
    // Onboard block 0, in replace, offload block 7
    // Offload block 6, and write content of [1,1,1,1] to block 1
    // Upon freeing up block 1, its child block 2, will be removed from the search tree,
    // which is a remove event.
    // Offload block 5, in replace onboard block 7, and write content of [0] to block 7.
    // In total, there are 2 offloads, 1 onboard, 1 removed, total of 4 events.
    // FIXME: For better improvement, when block 1 is overwritten, child blocks
    // are removed from the search tree and no longer reusable. Therefore these blocks
    // should be the first to be called upon when we want a new block.
// NOTE(review): template-argument lists ("<...>") were stripped from this chunk
// by extraction; tokens are kept exactly as recovered. Only comments/formatting
// are added here.
// Tail of KVCacheManagerEventStream: classify the four events produced for
// llmRequest4 into offloads / onboards / removals.
auto onboardedBlocks = 0;
    auto offloadedBlocks = 0;
    auto removedBlocks = 0;
    EXPECT_EQ(events.size(), 4);
    for (int i = 0; i < 4; i++)
    {
        if (std::holds_alternative(events.front().data))
        {
            // oldValue == 0 means the block left the primary level (offload);
            // otherwise it came back to primary (onboard).
            if (std::get(events.front().data).cacheLevel->oldValue == 0)
                offloadedBlocks++;
            else
                onboardedBlocks++;
        }
        else if (std::holds_alternative(events.front().data))
            removedBlocks++;
        else
            FAIL();
        events.pop_front();
    }
    EXPECT_EQ(onboardedBlocks, 1);
    EXPECT_EQ(offloadedBlocks, 2);
    EXPECT_EQ(removedBlocks, 1);

    kvCacheManager.storeContextBlocks(*llmRequest4);
    events = getEvents(kvCacheManager);
    EXPECT_TRUE(std::holds_alternative(events.front().data));
}

// Sliding-window attention (window 16 = 4 blocks) combined with block reuse:
// checks which blocks get stored for reuse and how partially-filled blocks are
// copied when reused.
TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
{
    auto constexpr numLayers = 2;
    auto constexpr numHeads = 2;
    auto constexpr sizePerHead = 64;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxBeamWidth = 1;
    auto constexpr sinkTokenLength = 0;
    auto const stream = std::make_shared();
    auto constexpr maxSequenceLength = 128;
    // Enable sliding window kv cache for long input tokens.
    auto constexpr maxAttentionWindow = 16;
    auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxSequenceLength, tokensPerBlock);
    auto constexpr blocksInPrimaryPool = 16;
    auto constexpr blocksInSecondaryPool = 16;
    auto constexpr enableBlockReuse = true;
    auto constexpr onboardBlocks = true;
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};

    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength,
        stream, maxSequenceLength, enableBlockReuse, onboardBlocks);
    kvCacheManager.allocatePools(false);
    auto const& blockManager = kvCacheManager.getBlockManager();
    auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager);

    SizeType32 constexpr maxNewTokens = 40;
    // prepare tokens with token[i] = 1000 + i
    TokenIdType constexpr firstToken = 1000;
    auto constexpr beamWidth = maxBeamWidth;
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};

    ///////////////////////////////////////////////////////////////////////////
    // add a request just at the attention window and then remove it
    SizeType32 requestId = 0;
    int inputLength = 16;
    auto inputTokens = std::make_shared(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    auto llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    auto constexpr beamIdx = 0;
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
    EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3}));

    // add tokens, making the window slide
    llmRequest->addNewToken(1016, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1017, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1018, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1019, beamIdx);
    kvCacheManager.addToken(requestId);
    auto numTokens = llmRequest->getNumTokens(beamIdx);
    auto numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPool;
    EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3, 4}));
    EXPECT_NO_THROW(static_cast(kvCacheManager.removeSequence(requestId, llmRequest)));
    numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPool;
    EXPECT_EQ(numAllocatedPrimaryBlocks, 0);
    // store blocks 0, 1, 2, 3, 4 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007], [1008,1009,1010,1011],
    // [1012,1013,1014,1015], [1016,1017])

    ///////////////////////////////////////////////////////////////////////////
    // add a short request and then remove it
    // reuse first 2 blocks {0, 1(p)} in previous request, copying block 1 to a new block 5
    requestId = 1;
    inputLength = 7;
    inputTokens->resize(inputLength);
    llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 6);
    EXPECT_THAT(seq1.getCacheBlockIds(onlyWindowSize).at(beamIdx),
        ::testing::ElementsAreArray(
            {0, 5})); // Can't use 5 since it's used to onboard block, so 6 is the next free block.
    llmRequest->addNewToken(1007, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1008, beamIdx);
    kvCacheManager.addToken(requestId);
    numTokens = llmRequest->getNumTokens(beamIdx);
    EXPECT_THAT(seq1.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 5, 6}));
    EXPECT_NO_THROW(static_cast(kvCacheManager.removeSequence(requestId, llmRequest)));

    ///////////////////////////////////////////////////////////////////////////
    // add a medium request and then remove it
    // reuse first 3 blocks {0, 1, 2(p)} in first request, copying block 2 to a new block 7
    requestId = 2;
    inputLength = 10;
    inputTokens->resize(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 9);
    EXPECT_THAT(seq2.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 7}));
    EXPECT_NO_THROW(static_cast(kvCacheManager.removeSequence(requestId, llmRequest)));

    ///////////////////////////////////////////////////////////////////////////
    // add a longer request within attention window and try to reuse
    // reuse blocks {0, 1, 2, 3(p)}, copying block 3 to a new block 8
    // then upon reaching attention window, get new block 9
    requestId = 3;
    inputLength = 15;
    inputTokens->resize(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq3 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 14);
    EXPECT_THAT(seq3.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 8}));
    // add new tokens to allocate another block, but not enough to detach block
    llmRequest->addNewToken(1015, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1016, beamIdx);
    kvCacheManager.addToken(requestId);
    EXPECT_THAT(seq3.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 8, 9}));
    EXPECT_NO_THROW(static_cast(kvCacheManager.removeSequence(requestId, llmRequest)));
}

// Sliding-window attention with reuse in an undersized primary pool: a block
// reclaimed from an earlier sequence must invalidate that sequence for
// store-for-reuse.
TEST_F(KVCacheManagerTest, KVCacheManagerSWAInvalidateReuseTest)
{
    auto constexpr numLayers = 2;
    auto constexpr numHeads = 2;
    auto constexpr sizePerHead = 64;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxBeamWidth = 1;
    auto constexpr sinkTokenLength = 0;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared();
    auto constexpr maxSequenceLength = 128;
    SizeType32 constexpr maxNewTokens = 40;
    // Enable sliding window kv cache for long input tokens.
    auto constexpr attentionWindow = 8;
    auto constexpr numWindows = 1;
    auto const maxAttentionWindowVec = std::vector{attentionWindow};
    auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxSequenceLength, tokensPerBlock);
    auto constexpr blocksInPrimaryPoolPerWindow = 4;
    auto constexpr blocksInSecondaryPoolPerWindow = 0;
    auto constexpr enableBlockReuse = true;
    auto constexpr onboardBlocks = true;
    auto const blocksPerWindow
        = BlocksPerWindow{{attentionWindow, {blocksInPrimaryPoolPerWindow, blocksInSecondaryPoolPerWindow}}};

    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        maxBeamWidth, maxAttentionWindowVec, std::nullopt, dtype, sinkTokenLength, stream, maxSequenceLength,
        enableBlockReuse, onboardBlocks);
    kvCacheManager.allocatePools(false);
    auto const& blockManager = kvCacheManager.getBlockManager();

    // prepare tokens with token[i] = 1000 + i
    TokenIdType constexpr firstToken = 1000;
    auto constexpr beamWidth = maxBeamWidth;
// NOTE(review): template-argument lists ("<...>") were stripped from this chunk
// by extraction; tokens are kept exactly as recovered. Only comments/formatting
// are added here. Continuation of KVCacheManagerSWAInvalidateReuseTest.
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto constexpr beamIdx = 0;

    ///////////////////////////////////////////////////////////////////////////
    // add a request more than attention window size, and add token to trigger
    // detach.
    int inputLength = 11;
    auto inputTokens = std::make_shared(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    auto llmRequest0 = std::make_shared(/*requestId=*/0, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(/*requestId=*/0, inputLength, beamWidth, llmRequest0);
    GenerationRequest const& seq0 = kvCacheManager.getSequence(/*requestId=*/0);
    // Block 0 goes out-of-window and is detached
    llmRequest0->addNewToken(1011, beamIdx);
    kvCacheManager.addToken(/*requestId=*/0);

    ///////////////////////////////////////////////////////////////////////////
    // add the second request that will take 2 blocks. Since we only have 4
    // blocks in the primary block pool and no secondary block pool, we will
    // need 1 block originally written by the previous sequence. Acquiring
    // the block means that the previous sequence is not valid to store for
    // reuse.
    inputLength = 8;
    inputTokens->resize(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    auto llmRequest1 = std::make_shared(/*requestId=*/1, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(/*requestId=*/1, inputLength, beamWidth, llmRequest1);
    GenerationRequest const& seq1 = kvCacheManager.getSequence(/*requestId=*/1);
    auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager);
    EXPECT_FALSE(blockManager.isSequenceValidForStoreForReuse(seq0.getRequestId(), onlyWindowSize));
    EXPECT_TRUE(blockManager.isSequenceValidForStoreForReuse(seq1.getRequestId(), onlyWindowSize));
    EXPECT_NO_THROW(static_cast(kvCacheManager.removeSequence(seq0.getRequestId(), llmRequest0)));
    EXPECT_NO_THROW(static_cast(kvCacheManager.removeSequence(seq1.getRequestId(), llmRequest1)));
}

// Two attention windows (8 and 16) managed by one KVCacheManager: allocation
// and reuse are tracked per window size.
TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
{
    auto constexpr numLayers = 2;
    auto constexpr numHeads = 2;
    auto constexpr sizePerHead = 64;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxBeamWidth = 1;
    auto constexpr sinkTokenLength = 0;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared();
    auto constexpr maxSequenceLength = 128;
    // Enable sliding window kv cache for long input tokens.
    auto constexpr minAttentionWindow = 8;
    auto constexpr maxAttentionWindow = 16;
    auto constexpr numWindows = 2;
    auto const maxAttentionWindowVec = std::vector{maxAttentionWindow, minAttentionWindow};
    auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxSequenceLength, tokensPerBlock);
    auto constexpr blocksInPrimaryPoolPerWindow = 16;
    auto constexpr blocksInSecondaryPoolPerWindow = 16;
    auto constexpr enableBlockReuse = true;
    auto constexpr onboardBlocks = true;
    auto const blocksPerWindow
        = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPoolPerWindow, blocksInSecondaryPoolPerWindow}},
            {minAttentionWindow, {blocksInPrimaryPoolPerWindow, blocksInSecondaryPoolPerWindow}}};

    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        maxBeamWidth, maxAttentionWindowVec, std::nullopt, dtype, sinkTokenLength, stream, maxSequenceLength,
        enableBlockReuse, onboardBlocks);
    kvCacheManager.allocatePools(false);
    auto const& blockManager = kvCacheManager.getBlockManager();
    auto const allBlocksInPrimaryPools = blockManager.getNumPrimaryBlocks();
    EXPECT_THAT(allBlocksInPrimaryPools, blocksInPrimaryPoolPerWindow * numWindows);
    ASSERT_EQ(blockManager.isVariableWindow(), true);
    ASSERT_EQ(blockManager.isVariableGQA(), false);

    SizeType32 constexpr maxNewTokens = 40;
    // prepare tokens with token[i] = 1000 + i
    TokenIdType constexpr firstToken = 1000;
    auto constexpr beamWidth = maxBeamWidth;
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto constexpr beamIdx = 0;

    // Helper: assert the block ids assigned to `seq` under the min and max
    // attention windows.
    auto const assertBlocks = [minAttentionWindow, maxAttentionWindow, beamIdx](GenerationRequest seq,
                                  std::initializer_list expectedBlocksMin, std::initializer_list expectedBlocksMax)
    {
        auto blocksMin = seq.getCacheBlockIds(minAttentionWindow).at(beamIdx);
        auto blocksMax = seq.getCacheBlockIds(maxAttentionWindow).at(beamIdx);
        EXPECT_THAT(blocksMin, ::testing::ElementsAreArray(expectedBlocksMin));
        EXPECT_THAT(blocksMax, ::testing::ElementsAreArray(expectedBlocksMax));
    };

    ///////////////////////////////////////////////////////////////////////////
    // add a request just at the minimum attention window and then remove it
    SizeType32 requestId = 0;
    int inputLength = 8;
    auto inputTokens = std::make_shared(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    auto llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq0 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 0);
    assertBlocks(seq0, {0, 1}, {0, 1});

    // add tokens, making the minimum attention window slide (not reaching the max attention window)
    llmRequest->addNewToken(1008, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1009, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1010, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1011, beamIdx);
    kvCacheManager.addToken(requestId);
    assertBlocks(seq0, {0, 1, 2}, {0, 1, 2});
    auto numAllocatedPrimaryBlocks
        = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPoolPerWindow * numWindows;
    EXPECT_NO_THROW(static_cast(kvCacheManager.removeSequence(requestId, llmRequest)));
    numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPoolPerWindow * numWindows;
    EXPECT_EQ(numAllocatedPrimaryBlocks, 0);
    // For both windows, store blocks 0, 1, 2 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007],
    // [1008,1009,1010,1011])

    ///////////////////////////////////////////////////////////////////////////
    // add a short request within both attention windows and try to reuse
    // reuse blocks {0, 1(p)} for both windows, copying block 1 to a new block 4 since it's not a leaf block and is
    // partially used.
    // upon reached attention window, get new block 5
    requestId = 1;
    inputLength = 7;
    inputTokens->resize(inputLength);
    std::iota(inputTokens->begin(), inputTokens->end(), firstToken);
    llmRequest = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, inputLength, beamWidth, llmRequest);
    GenerationRequest const& seq1 = kvCacheManager.getSequence(requestId);
    EXPECT_EQ(llmRequest->getContextCurrentPosition(), 6);
    assertBlocks(seq1, {0, 3}, {0, 3});
    // add new tokens to allocate another block, but not enough to detach block
    llmRequest->addNewToken(1008, beamIdx);
    kvCacheManager.addToken(requestId);
    llmRequest->addNewToken(1009, beamIdx);
    kvCacheManager.addToken(requestId);
    assertBlocks(seq1, {0, 3, 4}, {0, 3, 4});
    EXPECT_NO_THROW(static_cast(kvCacheManager.removeSequence(requestId, llmRequest)));
}

// Event buffer capped at a single entry: older events must be dropped and only
// the newest kept.
TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamOverflow)
{
    auto constexpr numLayers = 12;
    auto constexpr numHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr blocksInPrimaryPool = 8;
    auto constexpr blocksInSecondaryPool = 2;
    auto constexpr onboardBlocks = true;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared();
    auto constexpr beamWidth = 1;
    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto const maxAttentionWindow = maxSequenceLength;
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};
    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        beamWidth, std::vector{maxAttentionWindow}, std::nullopt, dtype, 0, stream, maxSequenceLength, true,
        onboardBlocks, CacheType::kSELF, std::nullopt,
        // (argument list continues on the next chunk line)
// NOTE(review): the `<...>` template arguments throughout this chunk appear to
// have been stripped by a transcription step (e.g. `std::make_shared(...)`,
// `std::vector{...}`, `std::holds_alternative(...)`). Code tokens are kept
// verbatim below; only comments and line formatting are added. TODO: restore
// the template arguments from the original file.

    // Tail of the preceding event-stream test (its opening lines are above
    // this chunk): a request tagged with a LoRA task id is added, its context
    // blocks are stored, and the emitted KV-cache events are inspected.
    std::make_unique(1));
    kvCacheManager.allocatePools(false);

    auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
    auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true);
    llmRequest0->setLoraTaskId(42);
    kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0);
    kvCacheManager.storeContextBlocks(*llmRequest0);

    auto events = getEvents(kvCacheManager);
    // The 'created' event should be evicted.
    EXPECT_EQ(events.size(), 1);
    EXPECT_TRUE(std::holds_alternative(events.front().data));
    // First stored block has no parent.
    EXPECT_EQ(std::get(events.front().data).parentHash, std::nullopt);

    kvCacheManager.addToken(0);
    llmRequest0->addNewToken(0, 0);
    events = getEvents(kvCacheManager);
    // Adding a token alone emits no event until the block is stored.
    EXPECT_EQ(events.size(), 0);

    (void) kvCacheManager.removeSequence(0, llmRequest0);
    events = getEvents(kvCacheManager);
    // The decode block is stored, and linked to the parent.
    EXPECT_EQ(events.size(), 1);
    EXPECT_TRUE(std::holds_alternative(events.front().data));
    EXPECT_NE(std::get(events.front().data).parentHash, std::nullopt);
}

// Verifies that retention priorities from KvCacheRetentionConfig show up in
// stored-block events, and that a later request with a different priority on
// the same blocks emits priority-updated events carrying old/new values.
TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamPriority)
{
    auto constexpr numLayers = 12;
    auto constexpr numHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr blocksInPrimaryPool = 8;
    auto constexpr blocksInSecondaryPool = 2;
    auto constexpr onboardBlocks = true;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared();
    auto constexpr beamWidth = 1;
    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto const maxAttentionWindow = maxSequenceLength;
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};

    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow,
        maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, dtype, 0, stream,
        maxSequenceLength, true, onboardBlocks, CacheType::kSELF, std::nullopt,
        std::make_unique(1024));
    kvCacheManager.allocatePools(false);

    // Request 0: retention priority 50 over the whole token range.
    auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
    auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true);
    llmRequest0->setKvCacheRetentionConfig(tle::KvCacheRetentionConfig(
        std::vector{tle::KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 50)}, 35));
    kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0);
    kvCacheManager.storeContextBlocks(*llmRequest0);
    (void) kvCacheManager.removeSequence(0, llmRequest0);

    auto events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 3);
    events.pop_front(); // The `CREATED` event
    // Both stored-block events carry the configured priority.
    for (int i = 0; i < 2; i++)
    {
        EXPECT_EQ(std::get(events.front().data).blocks.front().priority, 50);
        events.pop_front();
    }

    // Request 1: same tokens, no retention config -> reuse without updates.
    auto inputTokens1 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
    auto llmRequest1 = std::make_shared(1, 0, inputTokens1, samplingConfig, true);
    kvCacheManager.addSequence(1, inputTokens1->size(), beamWidth, llmRequest1);
    kvCacheManager.storeContextBlocks(*llmRequest1);
    (void) kvCacheManager.removeSequence(1, llmRequest1);

    events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 1);
    // The second partial block gets stored. No priorities updated.
    EXPECT_TRUE(std::holds_alternative(events.front().data));

    // Request 2: same tokens, priority 10 -> expect priority-updated events.
    auto inputTokens2 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
    auto llmRequest2 = std::make_shared(2, 0, inputTokens2, samplingConfig, true);
    llmRequest2->setKvCacheRetentionConfig(tle::KvCacheRetentionConfig(
        std::vector{tle::KvCacheRetentionConfig::TokenRangeRetentionConfig(0, std::nullopt, 10)}, 35));
    // Update the context block priorities
    kvCacheManager.addSequence(2, inputTokens2->size(), beamWidth, llmRequest2);

    events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 2);
    for (int i = 0; i < 2; i++)
    {
        auto diff = std::get(events.front().data).priority;
        EXPECT_EQ(diff->oldValue, 50);
        EXPECT_EQ(diff->newValue, 10);
        events.pop_front();
    }
}

// chopVectorIntoBlocks: 10 usable tokens, block size 4, partial blocks NOT
// requested -> only the two full blocks come back; the trailing pair is dropped.
TEST(KVCacheManagerHelpersTest, ChopVectorIntoBlocksBasicNoPartial)
{
    using namespace tensorrt_llm::batch_manager::kv_cache_manager;
    std::vector vec{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    auto blocks = chopVectorIntoBlocks(vec, 10, 4, false);
    std::vector> got(blocks.begin(), blocks.end());
    ASSERT_EQ(got.size(), 2);
    EXPECT_THAT(got[0], ::testing::ElementsAreArray({0, 1, 2, 3}));
    EXPECT_THAT(got[1], ::testing::ElementsAreArray({4, 5, 6, 7}));
}

// Same input with partial blocks requested -> trailing partial block included.
TEST(KVCacheManagerHelpersTest, ChopVectorIntoBlocksBasicWithPartial)
{
    using namespace tensorrt_llm::batch_manager::kv_cache_manager;
    std::vector vec{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    auto blocks = chopVectorIntoBlocks(vec, 10, 4, true);
    std::vector> got(blocks.begin(), blocks.end());
    ASSERT_EQ(got.size(), 3);
    EXPECT_THAT(got[0], ::testing::ElementsAreArray({0, 1, 2, 3}));
    EXPECT_THAT(got[1], ::testing::ElementsAreArray({4, 5, 6, 7}));
    EXPECT_THAT(got[2], ::testing::ElementsAreArray({8, 9}));
}

// Usable size (7) smaller than the vector -> only tokens [0, 7) are chopped.
TEST(KVCacheManagerHelpersTest, ChopVectorIntoBlocksWithUsableSize)
{
    using namespace tensorrt_llm::batch_manager::kv_cache_manager;
    std::vector vec{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    auto blocks = chopVectorIntoBlocks(vec, 7, 4, true);
    std::vector> got(blocks.begin(), blocks.end());
    ASSERT_EQ(got.size(), 2);
    EXPECT_THAT(got[0],
// NOTE(review): template arguments appear stripped by transcription in this
// chunk; tokens are kept verbatim, only comments/formatting added.
        ::testing::ElementsAreArray({0, 1, 2, 3}));
    EXPECT_THAT(got[1], ::testing::ElementsAreArray({4, 5, 6}));
}

// Pins a sequence's blocks, removes the sequence, and checks the pinned
// blocks are not freed until explicitly unpinned by id.
TEST_F(KVCacheManagerTest, PinAndUnpinBlocksById)
{
    using namespace tensorrt_llm::batch_manager::kv_cache_manager;
    auto constexpr numLayers = 2;
    auto constexpr numKvHeads = 2;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr blocksInPrimaryPool = 4;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr maxNumSequences = 8;
    auto const stream = std::make_shared();
    auto constexpr onboardBlocks = true;
    auto constexpr beamWidth = 1;
    auto const maxAttentionWindow = tokensPerBlock * blocksInPrimaryPool;
    BlocksPerWindow const blocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};

    KVCacheManager kvCacheManager(numLayers, numKvHeads, sizePerHead, tokensPerBlock, blocksPerWindow,
        maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt,
        nvinfer1::DataType::kHALF, 0, stream, maxAttentionWindow, true, onboardBlocks);
    kvCacheManager.allocatePools(false);

    LlmRequest::RequestIdType requestId{0};
    auto inputTokens = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto llmRequest = std::make_shared(requestId, 0, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequence(requestId, static_cast(inputTokens->size()), beamWidth, llmRequest);

    auto const totalBlocks = kvCacheManager.getMaxNumBlocks();
    auto const freeAfterAlloc = kvCacheManager.getNumFreeBlocks();
    EXPECT_LT(freeAfterAlloc, totalBlocks);

    kvCacheManager.pinBlocks(requestId);
    auto lastBlockIdOpt = kvCacheManager.getLastBlockId(requestId);
    ASSERT_TRUE(lastBlockIdOpt.has_value());
    // Remember the pinned block ids before the sequence goes away.
    auto const& allBlockIds = kvCacheManager.getCacheBlockIds(requestId, maxAttentionWindow)[0];
    std::vector pinnedBlockIds(allBlockIds.begin(), allBlockIds.end());

    (void) kvCacheManager.removeSequence(requestId, llmRequest);
    // Pinned blocks remain allocated after the sequence is removed.
    auto const freeAfterRemovePinned = kvCacheManager.getNumFreeBlocks();
    EXPECT_LT(freeAfterRemovePinned, totalBlocks);

    kvCacheManager.unpinBlocksById(pinnedBlockIds);
    // Unpinning releases everything back to the pool.
    auto const freeAfterUnpin = kvCacheManager.getNumFreeBlocks();
    EXPECT_EQ(freeAfterUnpin, totalBlocks);
}

// Exercises the blocking getLatestEvents() API: without an event manager no
// events appear; with one, flushIterationEvents() makes the latest batch
// available within the timeout.
TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamBlocking)
{
    auto constexpr numLayers = 12;
    auto constexpr numHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr maxNumSequences = 8;
    auto constexpr blocksInPrimaryPool = 8;
    auto constexpr blocksInSecondaryPool = 2;
    auto constexpr onboardBlocks = true;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared();
    auto constexpr beamWidth = 1;
    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto const maxAttentionWindow = maxSequenceLength;
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};

    // Manager WITHOUT an event manager: no events are ever produced.
    KVCacheManager kvCacheManagerTest(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow,
        maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt, dtype, 0, stream,
        maxSequenceLength, true, onboardBlocks, CacheType::kSELF, std::nullopt);
    EXPECT_EQ(getEvents(kvCacheManagerTest).size(), 0);

    // Manager WITH an event manager (buffer capacity 1024).
    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow,
        maxNumSequences, beamWidth, std::vector{maxAttentionWindow}, std::nullopt,
        nvinfer1::DataType::kHALF, 0, stream, maxSequenceLength, true, onboardBlocks, CacheType::kSELF,
        std::nullopt, std::make_unique(1024));
    kvCacheManager.allocatePools(false);
    kvCacheManager.flushIterationEvents();
    auto events = kvCacheManager.getLatestEvents(std::chrono::seconds(1));
    EXPECT_EQ(events.size(), 1);

    auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
    auto llmRequest0 =
// NOTE(review): template arguments appear stripped by transcription in this
// chunk; tokens are kept verbatim, only comments/formatting added.
        std::make_shared(0, 0, inputTokens0, samplingConfig, true);
    kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0);
    kvCacheManager.storeContextBlocks(*llmRequest0);
    kvCacheManager.flushIterationEvents();
    // The stored-blocks event becomes visible after the flush.
    events = kvCacheManager.getLatestEvents(std::chrono::seconds(1));
    EXPECT_EQ(events.size(), 1);
    EXPECT_TRUE(std::holds_alternative(events.front().data));
}

// Two attention windows (full + sliding): the per-window pool-created events
// carry the correct window size and per-cache-level block counts, and context
// blocks are only stored for the full window.
TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamWindowSize)
{
    auto constexpr numLayers = 2;
    auto constexpr numHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr maxNumSequences = 8;
    auto blocksInPool = std::vector{8, 2};
    auto blocksInSlidingWindowPool = std::vector{4, 2};
    auto constexpr onboardBlocks = true;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared();
    auto constexpr beamWidth = 1;
    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto const maxAttentionWindow = maxSequenceLength;
    auto const slidingWindow = tokensPerBlock * (maxBlocksPerSeq - 1);
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPool[0], blocksInPool[1]}},
        {slidingWindow, {blocksInSlidingWindowPool[0], blocksInSlidingWindowPool[1]}}};

    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow,
        maxNumSequences, beamWidth, std::vector{maxAttentionWindow, slidingWindow}, std::nullopt,
        dtype, 0, stream, maxSequenceLength, true, onboardBlocks, CacheType::kSELF, std::nullopt,
        std::make_unique(1024));
    kvCacheManager.allocatePools(false);

    // One created event per window, smallest window first.
    auto events = getEvents(kvCacheManager);
    EXPECT_EQ(events.size(), 2);
    EXPECT_EQ(events.front().windowSize, slidingWindow);
    EXPECT_EQ(std::get(events.front().data).numBlocksPerCacheLevel, blocksInSlidingWindowPool);
    EXPECT_EQ(events.back().windowSize, maxAttentionWindow);
    EXPECT_EQ(std::get(events.back().data).numBlocksPerCacheLevel, blocksInPool);

    auto inputTokens0 = std::make_shared(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
    auto llmRequest0 = std::make_shared(0, 0, inputTokens0, samplingConfig, true);
    kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0);
    kvCacheManager.storeContextBlocks(*llmRequest0);

    events = getEvents(kvCacheManager);
    // Expecting only 1 event, storeContextBlock is not called for sliding window.
    EXPECT_EQ(events.size(), 1);
    EXPECT_EQ(events.back().windowSize, maxAttentionWindow);
    EXPECT_TRUE(std::holds_alternative(events.back().data));
}

// Checks offload/onboard ordering on the transfer manager: an offload issued
// after an onboard of the same block must observe the onboarded (device) data,
// not the stale host copy.
TEST_F(KVCacheManagerTest, KVCacheTransferManagerConcurrencyTest)
{
    auto const blockSize = 16384;
    auto bufferManager = tensorrt_llm::runtime::BufferManager(std::make_shared());
    auto transferManager = KVCacheTransferManager(bufferManager);

    auto pool = KVCacheBlockPool(0, 2, 0, 0, 0);
    // Primary (GPU) block zero-filled; secondary (pinned host) block filled with ones below.
    pool.primaryPtr = bufferManager.gpu(tr::ITensor::makeShape({1, blockSize}), nvinfer1::DataType::kFLOAT);
    bufferManager.setZero(*pool.primaryPtr);
    pool.secondaryPtr = tr::BufferManager::pinned(tr::ITensor::makeShape({1, blockSize}), nvinfer1::DataType::kFLOAT);
    // Write some specific data into the cpu blocks.
// NOTE(review): template arguments appear stripped by transcription in this
// chunk; tokens are kept verbatim, only comments/formatting added.
    for (int i = 0; i < blockSize; i++)
    {
        tr::bufferCast(*pool.secondaryPtr)[i] = 1;
    }

    auto primaryBlock = std::make_shared(0, tensorrt_llm::kernels::KVCacheIndex(0, false));
    auto secondaryBlock = std::make_shared(1, tensorrt_llm::kernels::KVCacheIndex(0, true));

    // offload -> swap -> onboard -> sync, then offload again: the final
    // offload must write the device zeros back to the host buffer.
    transferManager.offload(primaryBlock, secondaryBlock, {pool});
    primaryBlock->swapMemoryPoolBlockOffset(secondaryBlock);
    transferManager.onboard(primaryBlock, secondaryBlock, {pool});
    transferManager.syncTransfers();
    transferManager.offload(primaryBlock, secondaryBlock, {pool});
    bufferManager.getStream().synchronize();

    // Host copy now reflects the zeroed device block, not the original ones.
    for (int i = 0; i < blockSize; i++)
    {
        EXPECT_EQ(tr::bufferCast(*pool.secondaryPtr)[i], 0);
    }
}

TEST_P(KVCacheManagerTest, DISABLED_KVCacheManagerSinkTokenLengthTest)
{
    // TODO: Support sink attention and add coverage
    // TODO: Support and add coverage for beamWidth > 1
    using DType = half;
    using SizeType32 = KVCacheManager::SizeType32;
    auto constexpr numLayers = 4;
    auto constexpr numHeads = 2;
    // heterogeneous layers
    std::vector const numHeadsPerLayer({3, 2, 1, 2});
    std::map const expectedHeadsPerPool({{0, 1}, {1, 2}, {2, 3}});
    std::map const expectedLayersPerPool({{0, 1}, {1, 2}, {2, 1}});
    auto constexpr sizePerHead = 64;
    auto constexpr hiddenSize = numHeads * sizePerHead;
    auto constexpr tokensPerBlock = 64;
    auto constexpr maxBlocksPerSeq = 10;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxBeamWidth = 4;
    auto constexpr sinkTokenLength = 4;
    auto const stream = std::make_shared();
    auto constexpr requestId = static_cast(7);

    // A "bubble" pads the sink tokens up to a full block boundary.
    auto constexpr sinkTokensInLastBlock = sinkTokenLength % tokensPerBlock;
    auto constexpr bubbleLength = (sinkTokensInLastBlock) ? tokensPerBlock - sinkTokensInLastBlock : 0;
    auto constexpr inputLength = tokensPerBlock * maxBlocksPerSeq - bubbleLength - 1;
    auto constexpr maxAttentionWindow = inputLength - tokensPerBlock;
    auto constexpr numSharedBlocks = (sinkTokenLength + bubbleLength) / tokensPerBlock;
    auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
    auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq;
    auto constexpr numSharedBlocksCtx = (inputLength + bubbleLength) / tokensPerBlock;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr enableBlockReuse = false;
    auto constexpr onboardBlocks = true;
    auto const homogeneousLayers = GetParam();
    auto const expectedNumPools = homogeneousLayers ? 1 : static_cast(expectedHeadsPerPool.size());
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {totalNumBlocks, blocksInSecondaryPool}}};
    auto const maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;

    KVCacheManager kvCacheManager = homogeneousLayers ?
// NOTE(review): template arguments appear stripped by transcription in this
// chunk; tokens are kept verbatim, only comments/formatting added.
        KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
            maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
            sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks)
        : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
            maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
            sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks);
    kvCacheManager.allocatePools(false);

    EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq);
    EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
    EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);

    auto const& blockManager = kvCacheManager.getBlockManager();
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks);

    EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth));
    // Context blocks are shared across beams; one extra block per beam.
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocksCtx - maxBeamWidth);

    tr::ITensor::SharedPtr kvCacheBlockOffsets = tr::BufferManager::cpu(
        tr::ITensor::makeShape({expectedNumPools, maxNumSequences * maxBeamWidth, 2, maxBlocksPerSeq}),
        tr::TRTDataType::value);
    kvCacheManager.copyBlockOffsets(*kvCacheBlockOffsets, 0, requestId);

    EXPECT_EQ(blockManager.getNumPools(), expectedNumPools);
    if (homogeneousLayers)
    {
        EXPECT_EQ(blockManager.getBlockSize(0), numHeads * sizePerHead * tokensPerBlock);
    }
    else
    {
        // Heterogeneous layers: each layer's block size depends on its own head count.
        for (auto layerIdx = 0; layerIdx < numLayers; layerIdx++)
        {
            EXPECT_EQ(blockManager.getBlockSize(blockManager.getLayerPoolIdx(layerIdx)),
                numHeadsPerLayer.at(layerIdx) * sizePerHead * tokensPerBlock);
        }
    }

    // Verify the K/V offset table: K and V offsets of each block are
    // consecutive, advancing by 2 * numLayersInPool per block.
    {
        for (auto poolIdx = 0; poolIdx < blockManager.getNumPools(); poolIdx++)
        {
            auto const numLayersInPool = homogeneousLayers ? numLayers : expectedLayersPerPool.at(poolIdx);
            auto const offsetBetweenBlocks = numLayersInPool * 2;
            tr::ITensor::SharedPtr blockOffsetsSlice
                = tr::ITensor::slice(tr::ITensor::at(kvCacheBlockOffsets, {poolIdx}), 0, maxBeamWidth);
            auto blockOffsetsShape = blockOffsetsSlice->getShape();
            auto const blockOffsetsPtr = tr::bufferCast(*blockOffsetsSlice);
            tk::KVCacheIndex::UnderlyingType runningSum{0};
            // Shared (context) blocks: same offset across all beams.
            for (auto block = 0; block < numSharedBlocksCtx; ++block)
            {
                for (auto beam = 0; beam < maxBeamWidth; ++beam)
                {
                    auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block);
                    auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block);
                    auto const kOffset = blockOffsetsPtr[kOffsetIdx];
                    auto const vOffset = blockOffsetsPtr[vOffsetIdx];
                    EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block;
                    ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block;
                }
                runningSum += offsetBetweenBlocks;
            }
            // Unshared block: each beam gets its own offset.
            {
                auto const block = numSharedBlocksCtx;
                for (auto beam = 0; beam < maxBeamWidth; ++beam)
                {
                    auto const kOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 0, block);
                    auto const vOffsetIdx = tc::flat_index(blockOffsetsShape.d, beam, 1, block);
                    auto const kOffset = blockOffsetsPtr[kOffsetIdx];
                    auto const vOffset = blockOffsetsPtr[vOffsetIdx];
                    EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block;
                    ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block;
                    runningSum += offsetBetweenBlocks;
                }
            }
        }
    }

    // replace the shared block with unshared blocks
    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocksCtx - maxBeamWidth * 2 + 1);
    // A second token fits in the same blocks; no further allocation.
    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numSharedBlocksCtx - maxBeamWidth * 2 + 1);
    EXPECT_NO_THROW((void) kvCacheManager.removeSequence(requestId));
    EXPECT_EQ(blockManager.getNumFreeBlocks(),
// NOTE(review): template arguments appear stripped by transcription in this
// chunk; tokens are kept verbatim, only comments/formatting added.
        totalNumBlocks);

    // Fill the manager with the maximum number of sequences and verify the
    // free-block count after each allocation step.
    auto currentNumBlocks = totalNumBlocks;
    for (auto requestCounter = 0; requestCounter < maxNumSequences; ++requestCounter)
    {
        auto const nextRequestId = static_cast(requestId + requestCounter);
        EXPECT_NO_THROW(kvCacheManager.addSequence(nextRequestId, inputLength, maxBeamWidth));
        currentNumBlocks -= numSharedBlocksCtx + maxBeamWidth;
        EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks);
        EXPECT_NO_THROW(kvCacheManager.addToken(nextRequestId));
        currentNumBlocks -= maxBeamWidth - 1;
        EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks);
    }
    auto numUsedBlocks = maxNumSequences * (numSharedBlocksCtx + maxBeamWidth * 2 - 1);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks - numUsedBlocks);
}

// Batch variant: allocates maxNumSequences sequences and validates the whole
// batched K/V block-offset table produced by getBlockOffsetsOfBatch().
TEST_P(KVCacheManagerTest, KVCacheManagerBatchTest)
{
    // Full attention
    using DType = half;
    using SizeType32 = KVCacheManager::SizeType32;
    auto constexpr numLayers = 4;
    auto constexpr numHeads = 2;
    // heterogeneous layers
    std::vector const numHeadsPerLayer({3, 2, 1, 2});
    std::map const expectedHeadsPerPool({{0, 1}, {1, 2}, {2, 3}});
    std::map const expectedLayersPerPool({{0, 1}, {1, 2}, {2, 1}});
    auto constexpr sizePerHead = 64;
    auto constexpr hiddenSize = numHeads * sizePerHead;
    auto constexpr tokensPerBlock = 64;
    auto constexpr maxBlocksPerSeq = 32;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxBeamWidth = 4;
    auto constexpr sinkTokenLength = 0;
    auto const stream = std::make_shared();
    auto constexpr maxSequenceLength = tokensPerBlock * maxBlocksPerSeq;
    auto constexpr maxAttentionWindow = maxSequenceLength;
    auto constexpr inputLength = maxSequenceLength - 2;
    auto constexpr numSharedBlocks = inputLength / tokensPerBlock;
    auto constexpr numBlocksPerSeq = numSharedBlocks + (maxBlocksPerSeq - numSharedBlocks) * maxBeamWidth;
    auto constexpr totalNumBlocks = maxNumSequences * numBlocksPerSeq;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr enableBlockReuse = false;
    auto constexpr onboardBlocks = true;
    auto const homogeneousLayers = GetParam();
    auto const expectedNumPools = homogeneousLayers ? 1 : static_cast(expectedHeadsPerPool.size());
    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {totalNumBlocks, blocksInSecondaryPool}}};

    KVCacheManager kvCacheManager = homogeneousLayers
        ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
            maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
            sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks)
        : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
            maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt, nvinfer1::DataType::kHALF,
            sinkTokenLength, stream, maxSequenceLength, enableBlockReuse, onboardBlocks);
    kvCacheManager.allocatePools(false);

    EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq);
    EXPECT_EQ(kvCacheManager.getTokensPerBlock(), tokensPerBlock);
    EXPECT_EQ(kvCacheManager.getMaxNumBlocks(), totalNumBlocks);

    auto const& blockManager = kvCacheManager.getBlockManager();
    EXPECT_EQ(blockManager.getNumFreeBlocks(), totalNumBlocks);

    for (auto requestId = 0; requestId < maxNumSequences; ++requestId)
    {
        EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth));
        auto const currentNumBlocks = totalNumBlocks - (requestId + 1) * (numSharedBlocks + maxBeamWidth);
        EXPECT_EQ(blockManager.getNumFreeBlocks(), currentNumBlocks);
    }

    tr::ITensor::SharedPtr const kvCacheBlockOffsets = tr::BufferManager::cpu(
        tr::ITensor::makeShape({expectedNumPools, maxNumSequences * maxBeamWidth, 2, maxBlocksPerSeq}),
        tr::TRTDataType::value);
    kvCacheManager.getBlockOffsetsOfBatch(*kvCacheBlockOffsets, 0, maxNumSequences, maxBeamWidth);

    EXPECT_EQ(blockManager.getNumPools(), expectedNumPools);
    if (homogeneousLayers)
    {
        EXPECT_EQ(blockManager.getBlockSize(0), numHeads * sizePerHead * tokensPerBlock);
    }
    else
    {
        for (auto layerIdx = 0; layerIdx < numLayers; layerIdx++)
        {
            EXPECT_EQ(blockManager.getBlockSize(blockManager.getLayerPoolIdx(layerIdx)),
                numHeadsPerLayer.at(layerIdx) * sizePerHead * tokensPerBlock);
        }
    }

    // Validate the batched offset table: per request, shared blocks repeat the
    // same offset across beams; the unshared block advances per beam.
    {
        for (auto poolIdx = 0; poolIdx < blockManager.getNumPools(); poolIdx++)
        {
            auto const numLayersInPool = homogeneousLayers ? numLayers : expectedLayersPerPool.at(poolIdx);
            auto const offsetBetweenBlocks = numLayersInPool * 2;
            tr::ITensor::SharedPtr const blockOffsetsSlice = tr::ITensor::slice(
                tr::ITensor::at(kvCacheBlockOffsets, {poolIdx}), 0, maxNumSequences * maxBeamWidth);
            auto blockOffsetsShape = blockOffsetsSlice->getShape();
            auto* const blockOffsetsPtr = tr::bufferCast(*blockOffsetsSlice);
            tk::KVCacheIndex::UnderlyingType runningSum{0};
            for (auto requestId = 0; requestId < maxNumSequences; ++requestId)
            {
                // Shared blocks
                for (auto block = 0; block < numSharedBlocks; ++block)
                {
                    for (auto beam = 0; beam < maxBeamWidth; ++beam)
                    {
                        auto const kOffsetIdx
                            = tc::flat_index(blockOffsetsShape.d, requestId * maxBeamWidth + beam, 0, block);
                        auto const vOffsetIdx
                            = tc::flat_index(blockOffsetsShape.d, requestId * maxBeamWidth + beam, 1, block);
                        auto const kOffset = blockOffsetsPtr[kOffsetIdx];
                        auto const vOffset = blockOffsetsPtr[vOffsetIdx];
                        EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block;
                        ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block;
                    }
                    runningSum += offsetBetweenBlocks;
                }
                // Unshared blocks
                auto const block = numSharedBlocks;
                {
                    for (auto beam = 0; beam < maxBeamWidth; ++beam)
                    {
                        auto const kOffsetIdx
                            = tc::flat_index(blockOffsetsShape.d, requestId * maxBeamWidth + beam, 0, block);
                        auto const vOffsetIdx
                            = tc::flat_index(blockOffsetsShape.d, requestId * maxBeamWidth + beam, 1, block);
                        auto const kOffset = blockOffsetsPtr[kOffsetIdx];
                        auto const vOffset = blockOffsetsPtr[vOffsetIdx];
                        EXPECT_EQ(kOffset.get(), runningSum) << "beam:" << beam << " block:" << block;
                        ASSERT_EQ(vOffset.get(), runningSum + 1) << "beam:" << beam << " block:" << block;
                        runningSum
// NOTE(review): template arguments appear stripped by transcription in this
// chunk; tokens are kept verbatim, only comments/formatting added.
                            += offsetBetweenBlocks;
                    }
                }
            }
        }
    }
}

namespace
{

// beam search with SWA is not supported for now
// Drives one request through context + generation and checks that
// getNeededBlocksOneStep() exactly predicts the blocks actually allocated at
// each step (with a one-block tolerance at the attention-window boundary).
void testNeededBlocksOneStep(bool kv_cache_block_reuse, int beamWidth, int draftLen, bool homogeneousLayers)
{
    using DType = half;
    using SizeType32 = KVCacheManager::SizeType32;
    auto constexpr numLayers = 4;
    auto constexpr numHeads = 2;
    // heterogeneous layers
    std::vector const numHeadsPerLayer({3, 2, 1, 2});
    auto constexpr sizePerHead = 64;
    auto constexpr hiddenSize = numHeads * sizePerHead;
    auto constexpr tokensPerBlock = 8;
    auto constexpr maxNumSequences = 8;
    auto constexpr sinkTokenLength = 0;
    auto const stream = std::make_shared();

    TLLM_CHECK(draftLen == 0 || beamWidth == 1);

    // Deal with one sequence for now
    auto constexpr requestId = static_cast(7);
    SizeType32 maxNewTokens = 20;
    bool isStreaming = false;
    SizeType32 const maxInputLength{65};
    SizeType32 const maxMaxBeamWidth{beamWidth};
    for (int maxBeamWidth = 1; maxBeamWidth <= maxMaxBeamWidth; ++maxBeamWidth)
    {
        tr::SamplingConfig const samplingConfig{maxBeamWidth};
        for (int inputLength = 44; inputLength < 45; ++inputLength)
        {
            auto constexpr maxAttentionWindow = 46;
            auto constexpr blocksInSecondaryPool = 0;
            auto constexpr onboardBlocks = true;
            auto constexpr maxSequenceLength = 256;
            auto constexpr maxBlocksPerSeq = tc::ceilDiv(maxSequenceLength, tokensPerBlock);
            auto constexpr totalNumBlocks = maxNumSequences * maxBlocksPerSeq;
            auto const blocksPerWindow
                = BlocksPerWindow{{maxAttentionWindow, {totalNumBlocks, blocksInSecondaryPool}}};

            KVCacheManager kvCacheManager = homogeneousLayers
                ? KVCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow,
                    maxNumSequences, maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt,
                    nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, kv_cache_block_reuse,
                    onboardBlocks)
                : KVCacheManager(numHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
                    maxBeamWidth, std::vector{maxAttentionWindow}, std::nullopt,
                    nvinfer1::DataType::kHALF, sinkTokenLength, stream, maxSequenceLength, kv_cache_block_reuse,
                    onboardBlocks);
            kvCacheManager.allocatePools(false);

            EXPECT_EQ(kvCacheManager.getOffsetTableDimensions().maxBlocksPerSeq, maxBlocksPerSeq);

            auto inputTokens = std::make_shared(VecTokens(inputLength, 0));
            auto draftTokens = std::make_shared>(draftLen);
            auto llmRequest
                = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);
            llmRequest->setDraftTokens(draftTokens);

            auto const onlyWindowSize = theOnlyWindowSize(kvCacheManager);

            // Predict, then allocate the context step (plus draft tokens).
            auto remainingBlocksToCompletion
                = kvCacheManager.getRemainingBlocksToCompletion(*llmRequest, onlyWindowSize);
            auto neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false, onlyWindowSize);
            auto currentNumAllocTotalBlocks = kvCacheManager.getNumAllocTotalBlocks();
            EXPECT_NO_THROW(kvCacheManager.addSequence(requestId, inputLength, maxBeamWidth, llmRequest));
            for (int di = 0; di < draftLen && di < maxNewTokens; ++di)
            {
                for (int beam = 0; beam < maxBeamWidth; beam++)
                {
                    llmRequest->addNewToken(1, beam);
                }
                EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
            }
            auto numUsedBlocksThisStep = kvCacheManager.getNumAllocTotalBlocks() - currentNumAllocTotalBlocks;
            EXPECT_EQ(numUsedBlocksThisStep, neededBlocksOneStep);

            // Simulate adding new tokens during generation
            llmRequest->setState(LlmRequestState::kGENERATION_IN_PROGRESS);
            for (int i = draftLen; i < maxNewTokens && (inputLength + i) < maxAttentionWindow; i += (draftLen + 1))
            {
                for (int beam = 0; beam < maxBeamWidth; beam++)
                {
                    llmRequest->addNewToken(1, beam);
                }
                // Predict before adding this step's tokens.
                neededBlocksOneStep = kvCacheManager.getNeededBlocksOneStep(*llmRequest, false, onlyWindowSize);
                currentNumAllocTotalBlocks = kvCacheManager.getNumAllocTotalBlocks();
                for (int beam = 0; beam < maxBeamWidth; beam++)
                {
                    for (int di = 0;
                         di < draftLen && (i + di) < maxNewTokens && (inputLength + i + di) < maxAttentionWindow;
                         ++di)
                    {
                        llmRequest->addNewToken(1, beam);
                    }
                }
                for (int di = 0;
                     di < draftLen + 1 && (i + di) < maxNewTokens && (inputLength + i + di) < maxAttentionWindow;
                     ++di)
                {
                    EXPECT_NO_THROW(kvCacheManager.addToken(requestId));
                }
                numUsedBlocksThisStep = kvCacheManager.getNumAllocTotalBlocks() - currentNumAllocTotalBlocks;
                if (inputLength + i + draftLen + 1 < maxAttentionWindow)
                {
                    EXPECT_EQ(numUsedBlocksThisStep, neededBlocksOneStep);
                }
                else
                {
                    // This test calculates neededBlocksOneStep for the entire step (which may exceed
                    // maxAttentionWindow), but adds tokens only up to maxAttentionWindow. In this case,
                    // numUsedBlocksThisStep may be smaller than neededBlocksOneStep by 1 block.
// NOTE(review): template arguments appear stripped by transcription in this
// chunk; tokens are kept verbatim, only comments/formatting added.
                    ASSERT_THAT(numUsedBlocksThisStep,
                        testing::AnyOf(testing::Eq(neededBlocksOneStep), testing::Eq(neededBlocksOneStep - 1)));
                }
            }
            // After adding tokens, initial remainingBlocksToCompletion should match current state + new
            // remainingBlocksToCompletion
            EXPECT_EQ(remainingBlocksToCompletion,
                kvCacheManager.getNumAllocTotalBlocks()
                    + kvCacheManager.getRemainingBlocksToCompletion(*llmRequest, onlyWindowSize));
        }
    }
}
} // namespace

TEST_P(KVCacheManagerTest, neededBlocksOneStepKvCacheBlockReuse)
{
    testNeededBlocksOneStep(true, 1, 0, GetParam()); // maxBeamWidth is 1 when kv cache reuse is enabled
}

TEST_P(KVCacheManagerTest, neededBlocksOneStep)
{
    testNeededBlocksOneStep(false, 4, 0, GetParam());
}

TEST_P(KVCacheManagerTest, neededBlocksOneStepKvCacheBlockReuseDraftTokens)
{
    testNeededBlocksOneStep(true, 1, 5, GetParam());
}

INSTANTIATE_TEST_SUITE_P(KVCacheManagerTest, KVCacheManagerTest, testing::Values(true, false), // homogeneousLayers
    generateTestName);

// calculateMaxBlockRequirementsPerBeam
// Tuple layout: (sequenceLength, sinkTokenLength, attentionWindow, tokensPerBlock, expectedBlocks).
using BlockRequirementsParamType = std::tuple;

class BlockRequirementsParamTest : public ::testing::TestWithParam
{
protected:
    void SetUp() override {}

    void TearDown() override {}
};

TEST_P(BlockRequirementsParamTest, TestCaculateMaxBlocksRequirement)
{
    BlockRequirementsParamType param = GetParam();
    auto const result = KVCacheManager::calculateMaxBlockRequirementsPerBeam(
        std::get<0>(param), std::get<1>(param), std::get<2>(param), std::get<3>(param));
    ASSERT_EQ(result, std::get<4>(param));
}

INSTANTIATE_TEST_SUITE_P(CalculateMaxBlockRequirementsPerBeam, BlockRequirementsParamTest,
    testing::Values(std::make_tuple(512, 0, 1024, 64, 8), std::make_tuple(513, 0, 1024, 64, 9),
        std::make_tuple(512, 0, 256, 64, 5), std::make_tuple(512, 0, 257, 64, 6),
        std::make_tuple(512, 64, 1024, 64, 8), std::make_tuple(513, 64, 1024, 64, 9),
        std::make_tuple(512, 64, 256, 64, 5), std::make_tuple(512, 64, 257, 64, 6),
        std::make_tuple(512, 65, 1024, 64, 9), std::make_tuple(513, 65, 1024, 64, 9),
        std::make_tuple(512, 65, 256, 64, 6), std::make_tuple(512, 65, 257, 64, 6)));

// calculateMaxBlockRequirements
TEST(CalculateMaxBlockRequirements, BeamWidthOneEqualRequirementsPerBeam)
{
    // With beamWidth == 1 the full calculation degenerates to the per-beam one.
    auto const perBeamResult = KVCacheManager::calculateMaxBlockRequirementsPerBeam(512, 0, 2048, 64);
    auto const result = KVCacheManager::calculateMaxBlockRequirements(257, 255, 0, 2048, 1, 64);
    ASSERT_EQ(result, perBeamResult);
}

TEST(CalculateMaxBlockRequirements, AttentionWindowFitsOutputEqualSingleBeamTimesBeamWidth)
{
    // When the attention window fits entirely in the output, every beam needs
    // its own full set of blocks.
    auto constexpr beamWidth = 4;
    auto constexpr attentionWindow = 128;
    auto constexpr outputLength = 255;
    auto const perBeamResult = KVCacheManager::calculateMaxBlockRequirementsPerBeam(512, 0, attentionWindow, 64);
    auto const result = KVCacheManager::calculateMaxBlockRequirements(257, 255, 0, attentionWindow, beamWidth, 64);
    ASSERT_EQ(result, perBeamResult * beamWidth);
}

TEST(CalculateMaxBlockRequirements, AttentionWindowOverlapsInputAndOutputReferenceResult)
{
    auto constexpr beamWidth = 4;
    auto constexpr attentionWindow = 412;
    auto constexpr outputLength = 255;
    auto const perBeamResult = KVCacheManager::calculateMaxBlockRequirementsPerBeam(512, 0, attentionWindow, 64);
    auto const result = KVCacheManager::calculateMaxBlockRequirements(257, 255, 0, attentionWindow, beamWidth, 64);
    auto const numContextBlocks = 2; // (412 - 255) / 64
    // There are 29 context tokens left over to be put in output blocks, so 284 tokens to fit in output blocks in
    // total: 5 blocks
    auto const numOutputBlocks = (5 + kSWAExtraBlock) * beamWidth;
    ASSERT_EQ(result, numContextBlocks + numOutputBlocks);
}

// calculateMaxAttentionWindow
TEST(CalculateMaxAttentionWindow, OutputTooLargeForCapacity)
{
    auto constexpr beamWidth = 4;
    auto constexpr blockCapacity = 12;
    auto constexpr outputLength = 255;
    auto const result
        = KVCacheManager::calculateMaxAttentionWindow(1024, outputLength, 0, blockCapacity, beamWidth, 64);
    ASSERT_EQ(result, 192);
}

TEST(CalculateMaxAttentionWindow, CapacityIsEnoughForWholeSequence)
{
    // Enough capacity -> the window covers the entire input + output.
    auto constexpr beamWidth = 4;
    auto constexpr blockCapacity = 256;
    auto constexpr outputLength = 1024;
    auto constexpr inputLength = 1024;
    auto const result
        = KVCacheManager::calculateMaxAttentionWindow(inputLength, outputLength, 0, blockCapacity, beamWidth, 64);
    ASSERT_EQ(result, inputLength + outputLength);
}

namespace
{

// Bundle of constructor arguments for building a KVCacheManager in tests.
struct KvCacheManagerInstantiationParameters
{
    SizeType32 numLayers;
    // Either a single head count for all layers or one per layer.
    std::variant> numHeadsPerLayer;
    SizeType32 sizePerHead;
    SizeType32 tokensPerBlock;
    BlocksPerWindow blocksPerWindow;
    SizeType32 sinkTokenLength;
    SizeType32 maxAttentionWindow;
    SizeType32 maxBeamWidth;
    SizeType32 maxNumTokens;
    bool kvCacheBlockReuse;
    std::vector maxAttentionWindowVec = {maxAttentionWindow};
    nvinfer1::DataType dtype = nvinfer1::DataType::kFLOAT;
};

// Convenience: a single-window BlocksPerWindow with no secondary blocks.
BlocksPerWindow blocksAndWindow(SizeType32 numPrimaryBlocks, SizeType32 windowSize)
{
    return {{windowSize, {numPrimaryBlocks, 0}}};
}

struct GetRemainingBlocksToCompletionOneRequestParameters
{
    KvCacheManagerInstantiationParameters kvCacheManagerInstantiationParameters;
    SizeType32 promptLength;
    SizeType32 maxOutputLength;
    SizeType32 expectedRemainingBlocksToCompletion;
};

struct FillKvCacheAndCompleteRequestsParameters
{
    KvCacheManagerInstantiationParameters kvCacheManagerInstantiationParameters;
    SizeType32 promptLength;
    SizeType32 maxOutputLength;
};

// Builds a KVCacheManager from the parameter bundle, dispatching on whether a
// uniform or per-layer head count was provided.
std::shared_ptr createKvCacheManager(
    KvCacheManagerInstantiationParameters const& kvCacheInstantiationParameters, StreamPtr stream)
{
    auto const maxInputLength = kvCacheInstantiationParameters.maxNumTokens - 1;
    auto const temporaryKvCacheInputs
        = TempAttentionWindowInputs{true, maxInputLength, kvCacheInstantiationParameters.maxNumTokens};
    auto const maxSequenceLength = kvCacheInstantiationParameters.maxNumTokens;
    auto const maxAttentionWindow = kvCacheInstantiationParameters.maxAttentionWindow;
    auto const [numBlocksInPrimaryPool, _] = kvCacheInstantiationParameters.blocksPerWindow.at(maxAttentionWindow);
    if (std::holds_alternative(kvCacheInstantiationParameters.numHeadsPerLayer))
    {
        // Uniform head count: expand to one entry per layer.
        auto const numHeadsPerLayer = std::get(kvCacheInstantiationParameters.numHeadsPerLayer);
        auto numHeadsPerLayerVec = std::vector{kvCacheInstantiationParameters.numLayers};
        std::fill(numHeadsPerLayerVec.begin(), numHeadsPerLayerVec.end(), numHeadsPerLayer);
        return std::make_shared(numHeadsPerLayerVec, kvCacheInstantiationParameters.sizePerHead,
            kvCacheInstantiationParameters.tokensPerBlock, kvCacheInstantiationParameters.blocksPerWindow,
            numBlocksInPrimaryPool, kvCacheInstantiationParameters.maxBeamWidth,
            std::vector{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheInputs,
            kvCacheInstantiationParameters.dtype, kvCacheInstantiationParameters.sinkTokenLength, stream,
            maxSequenceLength, kvCacheInstantiationParameters.kvCacheBlockReuse, true, CacheType::kSELF);
    }
    if (std::holds_alternative>(kvCacheInstantiationParameters.numHeadsPerLayer))
    {
        auto const numHeadsPerLayer = std::get>(kvCacheInstantiationParameters.numHeadsPerLayer);
        return std::make_shared(numHeadsPerLayer, kvCacheInstantiationParameters.sizePerHead,
            kvCacheInstantiationParameters.tokensPerBlock, kvCacheInstantiationParameters.blocksPerWindow,
            numBlocksInPrimaryPool, kvCacheInstantiationParameters.maxBeamWidth,
            std::vector{kvCacheInstantiationParameters.maxAttentionWindow}, temporaryKvCacheInputs,
            kvCacheInstantiationParameters.dtype, kvCacheInstantiationParameters.sinkTokenLength, stream,
            maxSequenceLength, kvCacheInstantiationParameters.kvCacheBlockReuse, true, CacheType::kSELF);
    }
    TLLM_THROW("Unhandled type of num heads per layer provided.");
}

// Adds identical-shape requests until the manager cannot fit another one,
// stores their context blocks, and returns the created requests.
std::vector fillKvCacheManager(KVCacheManager& kvCacheManager, SizeType32 promptLength,
    SizeType32 maxOutputLength, SizeType32 maxBeamWidth, RequestIdType requestIdStart)
{
    auto inputTokens = std::make_shared>(static_cast(promptLength));
    auto const llmRequest = LlmRequest{
        0,
        maxOutputLength,
        inputTokens,
        tensorrt_llm::runtime::SamplingConfig{maxBeamWidth},
        true,
    };
    // Adding enough requests to fill the kv-cache.
    auto remainingFreeBlocks = kvCacheManager.getNumFreeBlocks();
    auto llmRequests = std::vector{};
    auto const remainingBlocksToCompletionFromStart
        = kvCacheManager.getRemainingBlocksToCompletion(llmRequest, theOnlyWindowSize(kvCacheManager));
    do
    {
        ++requestIdStart;
        // Distinct token values per request prevent block reuse between them.
        std::fill(inputTokens->begin(), inputTokens->end(), requestIdStart);
        llmRequests.emplace_back(
            requestIdStart, maxOutputLength, inputTokens, tensorrt_llm::runtime::SamplingConfig{maxBeamWidth}, true);
        remainingFreeBlocks -= remainingBlocksToCompletionFromStart;
    } while (remainingFreeBlocks >= remainingBlocksToCompletionFromStart);
    for (auto request : llmRequests)
    {
        kvCacheManager.addSequence(request.mRequestId, request.getPromptLen(), maxBeamWidth, request);
        request.mState = LlmRequestState::kGENERATION_IN_PROGRESS;
        kvCacheManager.storeContextBlocks(request);
    }
    kvCacheManager.refreshBlocks();
    return llmRequests;
}

} // namespace

// Parameterized fixture: builds a manager from the test parameters before
// each RemainingBlocksToCompletion case.
class RemainingBlocksToCompletionTest : public ::testing::TestWithParam
{
protected:
    void SetUp() override
    {
        auto const stream = std::make_shared();
        auto const params = GetParam();
        kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream);
        kvCacheManager->allocatePools(false);
    }

    void TearDown() override {}

    std::shared_ptr kvCacheManager;
};

TEST_P(RemainingBlocksToCompletionTest, RemainingBlocksToCompletionCorrectlyEstimated)
{
    auto const params = GetParam();
    auto const inputTokens = std::make_shared>(static_cast(params.promptLength));
    auto const llmRequest = LlmRequest{
        0,
        params.maxOutputLength,
        inputTokens,
        tensorrt_llm::runtime::SamplingConfig{params.kvCacheManagerInstantiationParameters.maxBeamWidth},
        true,
    };

    auto const result
        = kvCacheManager->getRemainingBlocksToCompletion(llmRequest, theOnlyWindowSize(*kvCacheManager));

    ASSERT_EQ(result, params.expectedRemainingBlocksToCompletion);
}

// TODO: Support and add coverage for beamWidth > 1
// TODO: Support and add coverage for sink attention
INSTANTIATE_TEST_SUITE_P(RemainingBlocksToCompletionCorrectlyEstimated, RemainingBlocksToCompletionTest, ::testing::Values( GetRemainingBlocksToCompletionOneRequestParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 1, blocksAndWindow(4096, 4096), 0, 4096, 1, 4096 * 4, false, }, 1024, 128, 1024 + 128, }, GetRemainingBlocksToCompletionOneRequestParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096, 4096), 0, 4096, 1, 4096 * 4, false, }, 1024, 128, 18, }, GetRemainingBlocksToCompletionOneRequestParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096, 128), 0, 128, 1, 4096 * 4, false, }, 1024, 128, 18, // See `temporaryAttentionWindow` concept. })); class FillKvCacheAndCompleteRequestsTest : public ::testing::TestWithParam { protected: void SetUp() override { auto const stream = std::make_shared(); auto const params = GetParam(); kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream); kvCacheManager->allocatePools(false); } void TearDown() override {} std::shared_ptr kvCacheManager; }; TEST_P(FillKvCacheAndCompleteRequestsTest, FillKvCacheWithRequestsAndCompleteOneByOne) { auto const params = GetParam(); auto llmRequests = fillKvCacheManager(*kvCacheManager, params.promptLength, params.maxOutputLength, params.kvCacheManagerInstantiationParameters.maxBeamWidth, 0); // Completing half. 
for (auto& llmRequest : llmRequests) { for (SizeType32 i = 0; i < params.maxOutputLength; i++) { llmRequest.addNewToken(0, 0); kvCacheManager->addToken(llmRequest.mRequestId); } (void) kvCacheManager->removeSequence(llmRequest.mRequestId, llmRequest); } auto const [expectedNumFreeBlocks, _] = params.kvCacheManagerInstantiationParameters.blocksPerWindow.at( params.kvCacheManagerInstantiationParameters.maxAttentionWindow); ASSERT_EQ(kvCacheManager->getNumFreeBlocks(), expectedNumFreeBlocks); } TEST_P(FillKvCacheAndCompleteRequestsTest, FillKvCacheAndCompleteInParallel) { auto const params = GetParam(); auto llmRequests = fillKvCacheManager(*kvCacheManager, params.promptLength, params.maxOutputLength, params.kvCacheManagerInstantiationParameters.maxBeamWidth, 0); for (SizeType32 i = 0; i < params.maxOutputLength; i++) { for (auto const& llmRequest : llmRequests) { kvCacheManager->addToken(llmRequest.mRequestId); } } } // TODO: Support and add coverage for beamWidth > 1 // TODO: Support and add coverage for sink attention auto const paramValues = ::testing::Values( FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 1, blocksAndWindow(4096, 4096), 0, 4096, 1, 4096 * 4, false, }, 128, 128, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096, 4096), 0, 4096, 1, 4096 * 4, false, }, 256, 128, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096, 4096), 0, 4096, 1, 4096 * 4, false, }, 256, 128, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096, 4096), 0, 4096, 1, 4096 * 4, false, }, 500, 100, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096, 128), 0, 128, 1, 4096 * 4, false, }, 500, 100, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 
1, 1, 64, blocksAndWindow(4096 * 128, 4096), 0, 4096, 1, 4096 * 4, false, }, 5250, 500, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096, 2048), 0, 2048, 1, 4096 * 4, false, }, 5000, 500, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 5000, 500, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 500, 5000, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 2048, 2048, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 2049, 2048, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 2047, 2048, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 1024, 1024, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 1025, 1023, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 1023, 1025, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 2047, 32, }, FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 2049, 32, }, 
FillKvCacheAndCompleteRequestsParameters{ KvCacheManagerInstantiationParameters{ 1, 1, 1, 64, blocksAndWindow(4096 * 16, 2048), 0, 2048, 1, 4096 * 4, true, }, 2048 - 60, 32, }); INSTANTIATE_TEST_SUITE_P(FillKvCacheAndCompleteRequestsTest, FillKvCacheAndCompleteRequestsTest, paramValues); namespace { struct GetNeededBlocksOneStepOneRequestParameters { KvCacheManagerInstantiationParameters kvCacheManagerInstantiationParameters; SizeType32 promptLength; SizeType32 draftLength; bool contextStep; SizeType32 previousGeneratedTokens; bool twoStepsLookAhead; SizeType32 expectedNeededBlocksOneStep; }; } // namespace class NeededBlocksOneStepTest : public ::testing::TestWithParam { protected: void SetUp() override { auto const stream = std::make_shared(); auto const params = GetParam(); kvCacheManager = createKvCacheManager(params.kvCacheManagerInstantiationParameters, stream); kvCacheManager->allocatePools(/*useUvm=*/false); } void TearDown() override {} std::shared_ptr kvCacheManager; }; TEST_P(NeededBlocksOneStepTest, NeededBlocksOneStepTestCorrectlyEstimated) { auto const params = GetParam(); auto const onlyWindowSize = theOnlyWindowSize(*kvCacheManager); auto const requestId = 0; auto const inputTokens = std::make_shared>(static_cast(params.promptLength)); auto llmRequest = LlmRequest{ requestId, params.kvCacheManagerInstantiationParameters.maxNumTokens, inputTokens, tensorrt_llm::runtime::SamplingConfig{params.kvCacheManagerInstantiationParameters.maxBeamWidth}, true, }; auto draftTokens = std::make_shared>(params.draftLength); llmRequest.setDraftTokens(draftTokens); if (params.contextStep) { auto neededBlocksOneStep = kvCacheManager->getNeededBlocksOneStep(llmRequest, false, onlyWindowSize); ASSERT_EQ(neededBlocksOneStep, params.expectedNeededBlocksOneStep); } else { kvCacheManager->addSequence( requestId, params.promptLength, params.kvCacheManagerInstantiationParameters.maxBeamWidth, llmRequest); llmRequest.setState(LlmRequestState::kGENERATION_IN_PROGRESS); for (int 
beam = 0; beam < params.kvCacheManagerInstantiationParameters.maxBeamWidth; beam++) { for (SizeType32 i = 0; i < params.previousGeneratedTokens; i++) { llmRequest.addNewToken(0, beam); kvCacheManager->addToken(llmRequest.mRequestId); } } auto neededBlocksOneStep = kvCacheManager->getNeededBlocksOneStep(llmRequest, params.twoStepsLookAhead, onlyWindowSize); ASSERT_EQ(neededBlocksOneStep, params.expectedNeededBlocksOneStep); } } INSTANTIATE_TEST_SUITE_P(NeededBlocksOneStepTestCorrectlyEstimated, NeededBlocksOneStepTest, ::testing::Values( GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 136, /* draftLength */ 0, /* contextStep */ true, /* previousGeneratedTokens */ 0, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 9, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 512, /* draftLength */ 0, /* contextStep */ true, /* previousGeneratedTokens */ 0, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 32, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 
1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 1024, /* draftLength */ 0, /* contextStep */ true, /* previousGeneratedTokens */ 0, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 64, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 512, /* draftLength */ 0, /* contextStep */ false, /* previousGeneratedTokens */ 0, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 1, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 512, /* draftLength */ 0, /* contextStep */ false, /* previousGeneratedTokens */ 8, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 0, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 518, /* draftLength */ 0, /* contextStep */ false, /* previousGeneratedTokens */ 0, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 0, }, GetNeededBlocksOneStepOneRequestParameters{ 
KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 530, /* kvCacheBlockReuse */ false, }, /* promptLength */ 512, /* draftLength */ 0, /* contextStep */ false, /* previousGeneratedTokens */ 16, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 1, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 128, /* draftLength */ 0, /* contextStep */ false, /* previousGeneratedTokens */ 15, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 0, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 128, /* draftLength */ 0, /* contextStep */ false, /* previousGeneratedTokens */ 15, /* twoStepsLookAhead */ true, /* expectedNeededBlocksOneStep */ 1, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens 
*/ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 128, /* draftLength */ 0, /* contextStep */ false, /* previousGeneratedTokens */ 15, /* twoStepsLookAhead */ true, /* expectedNeededBlocksOneStep */ 1, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 302, // 14 tokens in last block /* draftLength */ 3, /* contextStep */ false, /* previousGeneratedTokens */ 0, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 1, }, GetNeededBlocksOneStepOneRequestParameters{ KvCacheManagerInstantiationParameters{ /* numLayers */ 1, /* numHeads */ 1, /* sizePerHead */ 1, /* tokensPerBlock */ 16, /* blocksPerWindow */ blocksAndWindow(/* numPrimaryBlocks */ 256, /* windowSize */ 512), /* sinkTokenLength */ 0, /* maxAttentionWindow */ 512, /* maxBeamWidth */ 1, /* maxNumTokens */ 513, /* kvCacheBlockReuse */ false, }, /* promptLength */ 298, // 10 tokens in last block /* draftLength */ 3, /* contextStep */ false, /* previousGeneratedTokens */ 0, /* twoStepsLookAhead */ false, /* expectedNeededBlocksOneStep */ 0, }));