[https://nvbugs/5625990][fix] Respect VSWA scheme when doing block store for reuse and load block for reuse in KV cache manager (#10183)

Signed-off-by: eopXD <yuehtingc@nvidia.com>
Yueh-Ting (eop) Chen 2025-12-29 14:29:14 +08:00 committed by GitHub
parent 2f8d6d25a8
commit 9cee32ab39
4 changed files with 122 additions and 79 deletions


@@ -380,6 +380,7 @@ public:
         , mBeamWidth(beamWidth)
         , mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig))
         , mNumFrontBlocksRemoved(0)
+        , mCurrentPrepopulatedPromptLen(std::numeric_limits<SizeType32>::max())
     {
         auto const numWindowSizes = windowSizeToMetadata.size();
         mCacheBlockIds.reserve(numWindowSizes);
@@ -500,6 +501,20 @@ public:
         return mKvCacheRetentionConfig.getDirectory();
     }
+
+    [[nodiscard]] SizeType32 getCurrentPrepopulatedPromptLen() const
+    {
+        return mCurrentPrepopulatedPromptLen;
+    }
+
+    void setCurrentPrepopulatedPromptLen(SizeType32 currentPrepopulatedPromptLen)
+    {
+        TLLM_CHECK_WITH_INFO(currentPrepopulatedPromptLen <= mCurrentPrepopulatedPromptLen,
+            "currentPrepopulatedPromptLen must be updated non-increasingly due to the "
+            "assumption that smaller window sizes have shorter or equal "
+            "currentPrepopulatedPromptLen in WindowSizeManager::loadOrAllocateBlocks.");
+        mCurrentPrepopulatedPromptLen = currentPrepopulatedPromptLen;
+    }
+
 private:
     // Request id of the sequence
     LlmRequest::RequestIdType mRequestId;
@@ -517,6 +532,8 @@ private:
     SizeType32 mNumFrontBlocksRemoved;
     // Set of used blocks by the sequence
     std::set<KVCacheBlock::IdType> mUsedBlocks;
+    // Current prepopulated prompt length
+    SizeType32 mCurrentPrepopulatedPromptLen;
 };
 // attach metadata to a pool pointer

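The setter added above encodes an invariant: across the per-window-size passes, the recorded prepopulated prompt length may only stay the same or shrink. A minimal self-contained sketch of that invariant follows, with a hypothetical tracker class standing in for the GenerationRequest bookkeeping (the class and names here are illustrative, not TensorRT-LLM types):

#include <cassert>
#include <cstdint>
#include <limits>

// Hypothetical stand-in for the mCurrentPrepopulatedPromptLen bookkeeping.
class PrepopulatedLenTracker
{
public:
    // Starts at max() so the first (largest) window size can record any length.
    int32_t get() const
    {
        return mLen;
    }

    void set(int32_t len)
    {
        // Mirrors the TLLM_CHECK_WITH_INFO in the diff: updates must be
        // non-increasing, because a smaller window can match at most as many
        // tokens for reuse as the larger windows already did.
        assert(len <= mLen && "prepopulated prompt length must not grow");
        mLen = len;
    }

private:
    int32_t mLen = std::numeric_limits<int32_t>::max();
};

int main()
{
    PrepopulatedLenTracker tracker;
    tracker.set(96); // largest window size matched 96 tokens
    tracker.set(64); // a smaller window size may only match fewer or equal tokens
    return 0;
}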

@@ -1224,7 +1224,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
         auto [partialMatch, numMatched, matchingBlock] = searchRoot != nullptr && blockItr != blockKeys.end()
             ? searchRoot->findMatchingBlock(*blockItr, mEnablePartialReuse, mCopyOnPartialReuse)
             : std::make_tuple(false, 0, nullptr);
-        if (matchingBlock != nullptr)
+        if (matchingBlock != nullptr && numMatchedTokens + numMatched <= sequence.getCurrentPrepopulatedPromptLen())
         {
             KVCacheBlock::IdType matchingBlockId = matchingBlock->getBlockId();
@@ -1338,6 +1338,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
         }
     }
+    sequence.setCurrentPrepopulatedPromptLen(numMatchedTokens);
     return numMatchedTokens;
 }
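loadOrAllocateBlocks runs once per window size, and the two changes above work as a pair: a window may only count a block as reused while the running token total stays within the length recorded by the previously processed window, and whatever it ends up matching becomes the new cap. Below is a hedged sketch of that cap propagation, under the assumption that window sizes are processed from largest to smallest (the function and variable names are illustrative, not the actual implementation):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// Illustrative only: each entry is how many prompt tokens one window size
// could match against stored blocks, ordered from the largest window down.
int32_t computePrepopulatedPromptLen(std::vector<int32_t> const& matchedPerWindow)
{
    int32_t cap = std::numeric_limits<int32_t>::max();
    for (int32_t matched : matchedPerWindow)
    {
        // A smaller window may not claim more reused tokens than the cap
        // left behind by the windows processed before it.
        cap = std::min(matched, cap);
    }
    // The single prepopulatedPromptLen the attention kernel accepts.
    return cap;
}

int main()
{
    // Windows matched 96, 64, and 80 tokens; the shared result is 64.
    return computePrepopulatedPromptLen({96, 64, 80}) == 64 ? 0 : 1;
}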
@@ -1731,9 +1732,22 @@ std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(
     // Released block will be stored when reuse is enabled.
     // Reuse is implied to be enabled if llmRequest is provided.
     std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
+    // For now, the attention kernel only accepts a single
+    // "prepopulatedPromptLen", i.e. all window sizes must use the same
+    // prepopulated prompt length, so it is meaningless to store blocks
+    // for reuse for one window size while the blocks of another window
+    // size are not valid for storing.
+    bool isAllWindowSizesValidForStoreForReuse = true;
+    for (auto& [windowSize, manager] : mWindowBlockManagers)
+    {
+        isAllWindowSizesValidForStoreForReuse &= manager.isSequenceValidForStoreForReuse(sequence.getRequestId());
+    }
     for (auto& [_, manager] : mWindowBlockManagers)
     {
-        if (!llmRequest.has_value() || llmRequest->isDummyRequest() || sequence.getBeamWidth() > 1)
+        if (!llmRequest.has_value() || llmRequest->isDummyRequest() || sequence.getBeamWidth() > 1
+            || !isAllWindowSizesValidForStoreForReuse)
         {
             lastStoredId = manager.releaseBlocks(sequence, std::nullopt);
         }

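The gate above is all-or-nothing: because the attention kernel consumes one shared prepopulatedPromptLen, storing reusable blocks for some window sizes but not others would advertise reuse that a later request could not actually honor. A small self-contained sketch of the same decision (the window sizes and the validity map are hypothetical):

#include <iostream>
#include <map>

int main()
{
    // Hypothetical per-window validity: the 4096 window cannot store this
    // sequence's blocks for reuse, so no window should store them.
    std::map<int, bool> validForStorePerWindow{{512, true}, {4096, false}};

    bool allValid = true;
    for (auto const& [windowSize, valid] : validForStorePerWindow)
    {
        allValid &= valid;
    }

    // Mirrors releaseBlocks: either every window stores for reuse, or none does.
    std::cout << (allValid ? "store blocks for reuse" : "release without storing") << "\n";
    return 0;
}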

@@ -147,6 +147,7 @@ TEST_F(KVCacheManagerTest, BlockManagerTest)
     auto constexpr requestId = 42;
     GenerationRequest seq0{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    blockManager.holdSequence(seq0.getRequestId());
     blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false);
     auto constexpr occupiedBlocks = (numBlocksPerBeam - 1) + beamWidth;
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - occupiedBlocks);
@@ -179,15 +180,18 @@ TEST_F(KVCacheManagerTest, BlockManagerTest)
     EXPECT_NO_THROW(
         blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false));
     GenerationRequest seq1{requestId + 1, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    blockManager.holdSequence(seq1.getRequestId());
     EXPECT_NO_THROW(
         blockManager.addSequence(seq1, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false));
     // same requestId not allowed
     GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    blockManager.holdSequence(seq2.getRequestId());
     EXPECT_THROW(
         blockManager.addSequence(seq2, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false),
         std::runtime_error);
     // no more blocks
     GenerationRequest seq3{requestId + 2, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    blockManager.holdSequence(seq3.getRequestId());
     EXPECT_THROW(
         blockManager.addSequence(seq3, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false),
         std::runtime_error);
@@ -800,39 +804,43 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     // reuse blocks 0, 1, 2(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8]) :: p = partial reuse
     auto inputTokens0 = std::make_shared<VecTokens>(*inputTokens);
     inputTokens0->emplace_back(9);
-    llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens0, samplingConfig, isStreaming);
+    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest0 = std::make_shared<LlmRequest>(
+        seq0_dup.getRequestId(), maxNewTokens, inputTokens0, samplingConfig, isStreaming);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq0.getRequestId());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
+    blockManager.holdSequence(seq0_dup.getRequestId());
+    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), promptLen0 - 1);
-    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
-    // note that seq0 is holding blocks 0, 1 and 2 until releaseBlocks is called
+    // note that seq0_dup is holding blocks 0, 1 and 2 until releaseBlocks is called
     // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     // reuse blocks 0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 4
     auto inputTokens1 = std::make_shared<VecTokens>(llmRequest1->getTokens(0));
-    llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming);
+    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest1 = std::make_shared<LlmRequest>(
+        seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, isStreaming);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq1.getRequestId());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
+    blockManager.holdSequence(seq1_dup.getRequestId());
+    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     llmRequest1->addNewToken(10, beamIdx); // block 4 contains [8, 9, 10]
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
     // block 2 is stored for reuse (block contains [8]). nb! Last token of last block is never stored
-    blockManager.releaseBlocks(seq0, llmRequest0);
-    blockManager.releaseSequence(seq0.getRequestId());
+    blockManager.releaseBlocks(seq0_dup, llmRequest0);
+    blockManager.releaseSequence(seq0_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // block 4 is stored for reuse (block contains [8, 9]). nb! Last token of last block is never stored
-    blockManager.releaseBlocks(seq1, llmRequest1);
-    blockManager.releaseSequence(seq1.getRequestId());
+    blockManager.releaseBlocks(seq1_dup, llmRequest1);
+    blockManager.releaseSequence(seq1_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
@@ -932,20 +940,22 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     // nb! LlmRequest retains state calculated during addSequence, and this state affects the result.
    // Calling addSequence a second time with the same LlmRequest object will produce incorrect state.
     // Create a new llmRequest4 instance to avoid this issue.
-    llmRequest4 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming);
+    GenerationRequest seq4_dup{14, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest4 = std::make_shared<LlmRequest>(
+        seq4_dup.getRequestId(), maxNewTokens, inputTokens4, samplingConfig, isStreaming);
     promptLen4 = llmRequest4->getNumTokens(beamIdx);
     numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq4.getRequestId());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
+    blockManager.holdSequence(seq4_dup.getRequestId());
+    blockManager.addSequence(seq4_dup, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 2);
-    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq4_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
-    blockManager.releaseBlocks(seq4, llmRequest4);
-    blockManager.releaseSequence(seq4.getRequestId());
+    blockManager.releaseBlocks(seq4_dup, llmRequest4);
+    blockManager.releaseSequence(seq4_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
@@ -1107,19 +1117,20 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
     ///////////////////////////////////////////////////////////////////////////
     // add both requests again and then remove them
     // reuse blocks 0, 1 and get new block 4
-    llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
+    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest0 = std::make_shared<LlmRequest>(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
         std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt,
-        false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false,
+        std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
         LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq0.getRequestId());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
+    blockManager.holdSequence(seq0_dup.getRequestId());
+    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     llmRequest0->addNewToken(3, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -1128,29 +1139,30 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
     auto inputTokenExtraIds1 = std::make_shared<VecTokenExtraIds>(*inputTokenExtraIds);
     inputTokenExtraIds1->push_back(0);
     inputTokenExtraIds1->push_back(0);
-    llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming,
+    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest1 = std::make_shared<LlmRequest>(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
         std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt,
-        false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false,
+        std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
         LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1, numReturnSequences);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq1.getRequestId());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
+    blockManager.holdSequence(seq1_dup.getRequestId());
+    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest1->addNewToken(5, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
-    blockManager.releaseBlocks(seq0, llmRequest0);
-    blockManager.releaseSequence(seq0.getRequestId());
+    blockManager.releaseBlocks(seq0_dup, llmRequest0);
+    blockManager.releaseSequence(seq0_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // block 2 is stored for reuse (block contains [(2, 0), (3, 0), (4, 0)])
-    blockManager.releaseBlocks(seq1, llmRequest1);
-    blockManager.releaseSequence(seq1.getRequestId());
+    blockManager.releaseBlocks(seq1_dup, llmRequest1);
+    blockManager.releaseSequence(seq1_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
@@ -1498,13 +1510,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     // add both requests again and then remove them
     // inputTokens = (0, 1, 2, 3, 4, 5, 6, 7, 8)
     // reuse blocks 0, 1 and get new block 4
-    llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
+    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest0 = std::make_shared<LlmRequest>(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq0.getRequestId());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
+    blockManager.holdSequence(seq0_dup.getRequestId());
+    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     // nb! addNewToken adds a new generated token; the number of input tokens stays the same.
     // calling addNewToken before addSequence potentially triggers this error message:
     // Assertion failed: prepopulatedPromptLen < promptLen
@@ -1512,34 +1525,35 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     // but promptLen is number of input tokens.
     llmRequest0->addNewToken(9, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // inputTokens1 = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
     auto inputTokens1 = std::make_shared<VecTokens>(llmRequest1->getTokens(0));
-    llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
+    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest1 = std::make_shared<LlmRequest>(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
     // reuse 0, 1, 2(p) ([0,1,2,3], [4,5,6,7], [8])
-    blockManager.holdSequence(seq1.getRequestId());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
+    blockManager.holdSequence(seq1_dup.getRequestId());
+    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest1->addNewToken(10, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);
     // store block 4 for reuse ([8])
-    blockManager.releaseBlocks(seq0, llmRequest0);
-    blockManager.releaseSequence(seq0.getRequestId());
+    blockManager.releaseBlocks(seq0_dup, llmRequest0);
+    blockManager.releaseSequence(seq0_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // block 2 is stored for reuse (block contains [8, 9]). nb! Last token of last block is not stored
-    blockManager.releaseBlocks(seq1, llmRequest1);
-    blockManager.releaseSequence(seq1.getRequestId());
+    blockManager.releaseBlocks(seq1_dup, llmRequest1);
+    blockManager.releaseSequence(seq1_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
@@ -1761,20 +1775,21 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
     ///////////////////////////////////////////////////////////////////////////
     // add both requests again and then remove them
-    llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt,
-        false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
+    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest0 = std::make_shared<LlmRequest>(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false,
+        std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
         LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
     // reuse blocks 0, 1 and get new block 6
-    blockManager.holdSequence(seq0.getRequestId());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
+    blockManager.holdSequence(seq0_dup.getRequestId());
+    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     llmRequest0->addNewToken(3, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6}));
+    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
@@ -1783,28 +1798,29 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
     auto inputTokenExtraIds1 = std::make_shared<VecTokenExtraIds>(*inputTokenExtraIds);
     inputTokenExtraIds1->push_back(0);
     inputTokenExtraIds1->push_back(0);
-    llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt,
-        false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
+    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest1 = std::make_shared<LlmRequest>(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false,
+        std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
         LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq1.getRequestId());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
+    blockManager.holdSequence(seq1_dup.getRequestId());
+    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
+    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
     llmRequest1->addNewToken(5, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2);
-    blockManager.releaseBlocks(seq0, llmRequest0);
-    blockManager.releaseSequence(seq0.getRequestId());
+    blockManager.releaseBlocks(seq0_dup, llmRequest0);
+    blockManager.releaseSequence(seq0_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
-    blockManager.releaseBlocks(seq1, llmRequest1);
-    blockManager.releaseSequence(seq1.getRequestId());
+    blockManager.releaseBlocks(seq1_dup, llmRequest1);
+    blockManager.releaseSequence(seq1_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

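Throughout the updated tests, a re-added request gets a fresh GenerationRequest (seq0_dup, seq1_dup, ...) with its own request id, and every addSequence is bracketed by holdSequence beforehand and releaseSequence after releaseBlocks. The toy registry below illustrates that discipline, including the duplicate-id failure the first test expects; it is a hypothetical sketch, not the TensorRT-LLM BlockManager:

#include <cstdint>
#include <set>
#include <stdexcept>

// Toy registry illustrating the hold / add / release discipline above.
class SequenceRegistry
{
public:
    void hold(uint64_t requestId)
    {
        mHeld.insert(requestId);
    }

    void add(uint64_t requestId)
    {
        // Mirrors the "same requestId not allowed" expectation in BlockManagerTest.
        if (!mActive.insert(requestId).second)
        {
            throw std::runtime_error("sequence already added");
        }
    }

    void release(uint64_t requestId)
    {
        mActive.erase(requestId);
        mHeld.erase(requestId);
    }

private:
    std::set<uint64_t> mHeld;
    std::set<uint64_t> mActive;
};

int main()
{
    SequenceRegistry registry;
    registry.hold(10); // holdSequence before addSequence
    registry.add(10);
    registry.release(10); // releaseSequence after releaseBlocks
    return 0;
}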

@@ -1211,10 +1211,6 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
-    @pytest.mark.skip(
-        reason=
-        "Currently failing due to accuracy drop, https://nvbugspro.nvidia.com/bug/5625990"
-    )
     def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
         # NOTE: Test with VSWA kv cache config.
         kv_cache_config = KvCacheConfig(