[https://nvbugs/5625990][fix] Respect VSWA scheme when doing block store for reuse and load block for reuse in KV cache manager (#10183)
Signed-off-by: eopXD <yuehtingc@nvidia.com>
parent 2f8d6d25a8
commit 9cee32ab39
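The change in a nutshell: with VSWA (variable sliding window attention), each attention window size matches a different number of reusable KV cache blocks, but the attention kernel consumes a single prepopulated prompt length per sequence, so the KV cache manager must track the minimum across window sizes when loading blocks for reuse. Below is a minimal standalone sketch of that idea (hypothetical names and placeholder logic, not the actual TensorRT-LLM implementation):

// Hypothetical sketch of the reuse-length tracking this commit adds.
// The real logic lives in WindowBlockManager::loadOrAllocateBlocks and
// GenerationRequest; everything here is illustrative.
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

using SizeType32 = std::int32_t;

int main()
{
    // Suppose three window sizes matched this many reusable prompt tokens.
    std::vector<SizeType32> matchedPerWindow{96, 64, 96};

    // The sequence starts with "no cap" and tightens it per window size,
    // so every window size ends up prepopulating the same (minimum) prefix.
    SizeType32 prepopulatedPromptLen = std::numeric_limits<SizeType32>::max();
    for (SizeType32 matched : matchedPerWindow)
    {
        // A later window size must not claim more reuse than an earlier one;
        // matches beyond the current cap are rejected (cf. the new check in
        // loadOrAllocateBlocks below), so the value only ever decreases.
        prepopulatedPromptLen = std::min(prepopulatedPromptLen, matched);
    }
    return prepopulatedPromptLen == 64 ? 0 : 1; // all windows reuse 64 tokens
}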
@@ -380,6 +380,7 @@ public:
         , mBeamWidth(beamWidth)
         , mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig))
         , mNumFrontBlocksRemoved(0)
+        , mCurrentPrepopulatedPromptLen(std::numeric_limits<SizeType32>::max())
     {
         auto const numWindowSizes = windowSizeToMetadata.size();
         mCacheBlockIds.reserve(numWindowSizes);
@@ -500,6 +501,20 @@ public:
         return mKvCacheRetentionConfig.getDirectory();
     }

+    [[nodiscard]] SizeType32 getCurrentPrepopulatedPromptLen() const
+    {
+        return mCurrentPrepopulatedPromptLen;
+    }
+
+    void setCurrentPrepopulatedPromptLen(SizeType32 currentPrepopulatedPromptLen)
+    {
+        TLLM_CHECK_WITH_INFO(currentPrepopulatedPromptLen <= mCurrentPrepopulatedPromptLen,
+            "currentPrepopulatedPromptLen must be updated non-increasingly due to the "
+            "assumption that smaller window sizes have shorter or equal "
+            "currentPrepopulatedPromptLen in WindowSizeManager::loadOrAllocateBlocks.");
+        mCurrentPrepopulatedPromptLen = currentPrepopulatedPromptLen;
+    }
+
 private:
     // Request id of the sequence
     LlmRequest::RequestIdType mRequestId;
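The TLLM_CHECK_WITH_INFO above encodes the assumption that window sizes are visited so that the tracked value never grows. A compilable toy version of the same invariant (hypothetical type, with a plain assert standing in for the TLLM macro):

#include <cassert>
#include <cstdint>
#include <limits>

// Hypothetical stand-in for the member pair added above.
struct PrepopulatedLenTracker
{
    std::int32_t mLen = std::numeric_limits<std::int32_t>::max();

    void set(std::int32_t len)
    {
        // Non-increasing updates only, mirroring the TLLM_CHECK_WITH_INFO.
        assert(len <= mLen && "must be updated non-increasingly");
        mLen = len;
    }
};

int main()
{
    PrepopulatedLenTracker tracker;
    tracker.set(128); // first window size: 128 reusable tokens
    tracker.set(64);  // smaller window size: fewer tokens, allowed
    // tracker.set(96); // would fire the assertion: the value may not grow
    return 0;
}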
@@ -517,6 +532,8 @@ private:
     SizeType32 mNumFrontBlocksRemoved;
     // Set of used blocks by the sequence
     std::set<KVCacheBlock::IdType> mUsedBlocks;
+    // Current prepopulated prompt length
+    SizeType32 mCurrentPrepopulatedPromptLen;
 };

 // attach metadata to a pool pointer
@@ -1224,7 +1224,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
         auto [partialMatch, numMatched, matchingBlock] = searchRoot != nullptr && blockItr != blockKeys.end()
             ? searchRoot->findMatchingBlock(*blockItr, mEnablePartialReuse, mCopyOnPartialReuse)
             : std::make_tuple(false, 0, nullptr);
-        if (matchingBlock != nullptr)
+        if (matchingBlock != nullptr && numMatchedTokens + numMatched <= sequence.getCurrentPrepopulatedPromptLen())
        {
             KVCacheBlock::IdType matchingBlockId = matchingBlock->getBlockId();
@@ -1338,6 +1338,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
         }
     }

+    sequence.setCurrentPrepopulatedPromptLen(numMatchedTokens);
     return numMatchedTokens;
 }

@@ -1731,9 +1732,22 @@ std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(
     // Released block will be stored when reuse is enabled.
     // Reuse is implied to be enabled if llmRequest is provided.
     std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
+
+    // For now, the attention kernel only accepts a single
+    // "prepopulatedPromptLen", that is, all window sizes will use the same
+    // prepopulated prompt length, so it is meaningless right now to save
+    // blocks only for a certain window size while blocks in the other
+    // window size are not valid for saving for reuse.
+    bool isAllWindowSizesValidForStoreForReuse = true;
+    for (auto& [windowSize, manager] : mWindowBlockManagers)
+    {
+        isAllWindowSizesValidForStoreForReuse &= manager.isSequenceValidForStoreForReuse(sequence.getRequestId());
+    }
+
     for (auto& [_, manager] : mWindowBlockManagers)
     {
-        if (!llmRequest.has_value() || llmRequest->isDummyRequest() || sequence.getBeamWidth() > 1)
+        if (!llmRequest.has_value() || llmRequest->isDummyRequest() || sequence.getBeamWidth() > 1
+            || !isAllWindowSizesValidForStoreForReuse)
         {
             lastStoredId = manager.releaseBlocks(sequence, std::nullopt);
         }
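The comment block in this hunk carries the key design choice: because the kernel accepts one prepopulatedPromptLen for all window sizes, blocks are stored for reuse only when every window size is valid for storing, otherwise none are. A self-contained sketch of that AND-reduction (stub types and hypothetical field names, same shape as the loop above):

#include <map>

// Hypothetical stand-in for the per-window managers.
struct WindowManagerStub
{
    bool validForStore; // e.g. false once VSWA has evicted this window's blocks

    bool isSequenceValidForStoreForReuse(int /*requestId*/) const
    {
        return validForStore;
    }
};

// Blocks are stored for reuse only if *every* window size agrees.
bool allWindowsValidForStore(std::map<int, WindowManagerStub> const& windows, int requestId)
{
    bool valid = true;
    for (auto const& [windowSize, manager] : windows)
    {
        valid &= manager.isSequenceValidForStoreForReuse(requestId);
    }
    return valid;
}

int main()
{
    std::map<int, WindowManagerStub> windows{{512, {true}}, {4096, {false}}};
    // One invalid window size disables store-for-reuse for all of them.
    return allWindowsValidForStore(windows, 42) ? 1 : 0; // returns 0 here
}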
@@ -147,6 +147,7 @@ TEST_F(KVCacheManagerTest, BlockManagerTest)

     auto constexpr requestId = 42;
     GenerationRequest seq0{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    blockManager.holdSequence(seq0.getRequestId());
     blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false);
     auto constexpr occupiedBlocks = (numBlocksPerBeam - 1) + beamWidth;
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - occupiedBlocks);
@@ -179,15 +180,18 @@ TEST_F(KVCacheManagerTest, BlockManagerTest)
     EXPECT_NO_THROW(
         blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false));
     GenerationRequest seq1{requestId + 1, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    blockManager.holdSequence(seq1.getRequestId());
     EXPECT_NO_THROW(
         blockManager.addSequence(seq1, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false));
     // same requestId not allowed
     GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    blockManager.holdSequence(seq2.getRequestId());
     EXPECT_THROW(
         blockManager.addSequence(seq2, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false),
         std::runtime_error);
     // no more blocks
     GenerationRequest seq3{requestId + 2, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    blockManager.holdSequence(seq3.getRequestId());
     EXPECT_THROW(
         blockManager.addSequence(seq3, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false),
         std::runtime_error);
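These assertions pin down the new contract: a request id may be held and added once, and adding a second sequence with the same id raises std::runtime_error. A toy guard with the same observable behavior (hypothetical class; the real BlockManager bookkeeping is only observed here through EXPECT_THROW):

#include <set>
#include <stdexcept>

// Hypothetical hold-registry illustrating the duplicate-requestId rejection
// the test exercises; not the actual BlockManager implementation.
class HoldRegistry
{
public:
    void add(int requestId)
    {
        if (!mHeld.insert(requestId).second)
        {
            throw std::runtime_error("requestId already in use");
        }
    }

    void release(int requestId)
    {
        mHeld.erase(requestId);
    }

private:
    std::set<int> mHeld;
};

int main()
{
    HoldRegistry registry;
    registry.add(42);
    try
    {
        registry.add(42); // same requestId not allowed
        return 1;
    }
    catch (std::runtime_error const&)
    {
        return 0; // mirrors EXPECT_THROW(..., std::runtime_error) above
    }
}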
@@ -800,39 +804,43 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     // reuse blocks 0, 1, 2(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8]) :: p = partial reuse
     auto inputTokens0 = std::make_shared<VecTokens>(*inputTokens);
     inputTokens0->emplace_back(9);
-    llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens0, samplingConfig, isStreaming);
+    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest0 = std::make_shared<LlmRequest>(
+        seq0_dup.getRequestId(), maxNewTokens, inputTokens0, samplingConfig, isStreaming);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq0.getRequestId());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
+    blockManager.holdSequence(seq0_dup.getRequestId());
+    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), promptLen0 - 1);
-    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
-    // note that seq0 is holding blocks 0, 1 and 2 until releaseBlocks is called
+    // note that seq0_dup is holding blocks 0, 1 and 2 until releaseBlocks is called

     // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     // reuse blocks 0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 4
     auto inputTokens1 = std::make_shared<VecTokens>(llmRequest1->getTokens(0));
-    llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming);
+    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest1 = std::make_shared<LlmRequest>(
+        seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, isStreaming);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq1.getRequestId());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
+    blockManager.holdSequence(seq1_dup.getRequestId());
+    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     llmRequest1->addNewToken(10, beamIdx); // block 4 contains [8, 9, 10]
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);

     // block 2 is stored for reuse (block contains [8]). nb! Last token of last block is never stored
-    blockManager.releaseBlocks(seq0, llmRequest0);
-    blockManager.releaseSequence(seq0.getRequestId());
+    blockManager.releaseBlocks(seq0_dup, llmRequest0);
+    blockManager.releaseSequence(seq0_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // block 4 is stored for reuse (block contains [8, 9]). nb! Last token of last block is never stored
-    blockManager.releaseBlocks(seq1, llmRequest1);
-    blockManager.releaseSequence(seq1.getRequestId());
+    blockManager.releaseBlocks(seq1_dup, llmRequest1);
+    blockManager.releaseSequence(seq1_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

@@ -932,20 +940,22 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest)
     // nb! LlmRequest retains state calculated during addSequence, this state affects result.
     // Calling addSequence a second time with same LlmRequest object will produce incorrect state.
     // Create new llmRequest4 instance to avoid this issue.
-    llmRequest4 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming);
+    GenerationRequest seq4_dup{14, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest4 = std::make_shared<LlmRequest>(
+        seq4_dup.getRequestId(), maxNewTokens, inputTokens4, samplingConfig, isStreaming);
     promptLen4 = llmRequest4->getNumTokens(beamIdx);
     numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq4.getRequestId());
-    blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
+    blockManager.holdSequence(seq4_dup.getRequestId());
+    blockManager.addSequence(seq4_dup, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow);
     EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 2);
-    EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq4_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     numTokens = llmRequest4->getNumTokens(beamIdx);
     numBlocks = tc::ceilDiv(numTokens, tokensPerBlock);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);

-    blockManager.releaseBlocks(seq4, llmRequest4);
-    blockManager.releaseSequence(seq4.getRequestId());
+    blockManager.releaseBlocks(seq4_dup, llmRequest4);
+    blockManager.releaseSequence(seq4_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

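The comment above ("LlmRequest retains state calculated during addSequence") is why every re-add in these tests builds a fresh LlmRequest together with a fresh *_dup sequence. A toy stub showing how retained state goes stale on a second add (hypothetical fields, illustrative only):

#include <algorithm>
#include <cstdint>

// Hypothetical stub showing why the tests create a fresh request object per
// addSequence: the request caches reuse state computed on the first add.
struct RequestStub
{
    std::int32_t promptLen;
    std::int32_t contextCurrentPosition = 0;

    void onAddSequence(std::int32_t prepopulatedLen)
    {
        // State accumulates; a second call starts from stale state.
        contextCurrentPosition = std::max(contextCurrentPosition, prepopulatedLen);
    }
};

int main()
{
    RequestStub request{12};
    request.onAddSequence(8); // fine: position 8 of 12 prompt tokens

    // Re-adding the same object keeps the old position around; the tests
    // avoid this by constructing a new LlmRequest instance instead.
    request.onAddSequence(4);
    return request.contextCurrentPosition == 8 ? 0 : 1; // stale 8, not 4
}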
@@ -1107,19 +1117,20 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
     ///////////////////////////////////////////////////////////////////////////
     // add both requests again and then remove them
     // reuse blocks 0, 1 and get new block 4
-    llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
+    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest0 = std::make_shared<LlmRequest>(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
         std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt,
-        false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false,
+        std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
         LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq0.getRequestId());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
+    blockManager.holdSequence(seq0_dup.getRequestId());
+    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     llmRequest0->addNewToken(3, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);

@@ -1128,29 +1139,30 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest)
     auto inputTokenExtraIds1 = std::make_shared<VecTokenExtraIds>(*inputTokenExtraIds);
     inputTokenExtraIds1->push_back(0);
     inputTokenExtraIds1->push_back(0);
-    llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming,
+    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest1 = std::make_shared<LlmRequest>(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
         std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt,
-        false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false,
+        std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
         LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1, numReturnSequences);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq1.getRequestId());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
+    blockManager.holdSequence(seq1_dup.getRequestId());
+    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest1->addNewToken(5, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);

-    blockManager.releaseBlocks(seq0, llmRequest0);
-    blockManager.releaseSequence(seq0.getRequestId());
+    blockManager.releaseBlocks(seq0_dup, llmRequest0);
+    blockManager.releaseSequence(seq0_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // block 2 is stored for reuse (block contains [(2, 0), (3, 0), (4, 0)])
-    blockManager.releaseBlocks(seq1, llmRequest1);
-    blockManager.releaseSequence(seq1.getRequestId());
+    blockManager.releaseBlocks(seq1_dup, llmRequest1);
+    blockManager.releaseSequence(seq1_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

@@ -1498,13 +1510,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     // add both requests again and then remove them
     // inputTokens = (0, 1, 2, 3, 4, 5, 6, 7, 8)
     // reuse blocks 0, 1 and get new block 4
-    llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
+    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest0 = std::make_shared<LlmRequest>(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq0.getRequestId());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
+    blockManager.holdSequence(seq0_dup.getRequestId());
+    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     // nb! addNewToken adds new generated token, number of input tokens stay the same.
     // calling addNewToken before addSequence potentially triggers this error message:
     // Assertion failed: prepopulatedPromptLen < promptLen
@@ -1512,34 +1525,35 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest)
     // but promptLen is number of input tokens.
     llmRequest0->addNewToken(9, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
+    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);

     // inputTokens1 = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
     auto inputTokens1 = std::make_shared<VecTokens>(llmRequest1->getTokens(0));
-    llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
+    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest1 = std::make_shared<LlmRequest>(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
     // reuse 0, 1, 2(p) ([0,1,2,3], [4,5,6,7], [8])
-    blockManager.holdSequence(seq1.getRequestId());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
+    blockManager.holdSequence(seq1_dup.getRequestId());
+    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
+    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2}));
     llmRequest1->addNewToken(10, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1);

     // store block 4 for reuse ([8])
-    blockManager.releaseBlocks(seq0, llmRequest0);
-    blockManager.releaseSequence(seq0.getRequestId());
+    blockManager.releaseBlocks(seq0_dup, llmRequest0);
+    blockManager.releaseSequence(seq0_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
     // block 2 is stored for reuse (block contains [8, 9]). nb! Last token of last block is not stored
-    blockManager.releaseBlocks(seq1, llmRequest1);
-    blockManager.releaseSequence(seq1.getRequestId());
+    blockManager.releaseBlocks(seq1_dup, llmRequest1);
+    blockManager.releaseSequence(seq1_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

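The surrounding comments reference the assertion prepopulatedPromptLen < promptLen: since the last token of the last block is never stored, at most promptLen - 1 tokens can be reused, so at least one prompt token is always recomputed. A tiny check of that arithmetic (illustrative only):

#include <cassert>

// Hypothetical illustration of the invariant mentioned in the test comments:
// reuse must leave at least one prompt token to recompute, i.e.
// prepopulatedPromptLen < promptLen, because block store skips the final token.
int main()
{
    int promptLen = 9;                        // input tokens [0..8]
    int storedTokens = promptLen - 1;         // last token is never stored
    int prepopulatedPromptLen = storedTokens; // best case: full reuse of the stored prefix
    assert(prepopulatedPromptLen < promptLen);
    return 0;
}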
@@ -1761,20 +1775,21 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)

     ///////////////////////////////////////////////////////////////////////////
     // add both requests again and then remove them
-    llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt,
-        false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
+    GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest0 = std::make_shared<LlmRequest>(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false,
+        std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
         LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds);
     promptLen0 = llmRequest0->getNumTokens(beamIdx);
     numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
     // reuse blocks 0, 1 and get new block 6
-    blockManager.holdSequence(seq0.getRequestId());
-    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
+    blockManager.holdSequence(seq0_dup.getRequestId());
+    blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
     llmRequest0->addNewToken(3, beamIdx);
     EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock);
-    EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6}));
+    EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6}));
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);

@@ -1783,28 +1798,29 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest)
     auto inputTokenExtraIds1 = std::make_shared<VecTokenExtraIds>(*inputTokenExtraIds);
     inputTokenExtraIds1->push_back(0);
     inputTokenExtraIds1->push_back(0);
-    llmRequest1 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt,
-        std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt,
-        false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
+    GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};
+    llmRequest1 = std::make_shared<LlmRequest>(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig,
+        isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2,
+        std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false,
+        std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt,
         LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1);
     promptLen1 = llmRequest1->getNumTokens(beamIdx);
     numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock());
-    blockManager.holdSequence(seq1.getRequestId());
-    blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
+    blockManager.holdSequence(seq1_dup.getRequestId());
+    blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow);
     EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1);
-    EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
+    EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5}));
     llmRequest1->addNewToken(5, beamIdx);
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2);

-    blockManager.releaseBlocks(seq0, llmRequest0);
-    blockManager.releaseSequence(seq0.getRequestId());
+    blockManager.releaseBlocks(seq0_dup, llmRequest0);
+    blockManager.releaseSequence(seq0_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks);
-    blockManager.releaseBlocks(seq1, llmRequest1);
-    blockManager.releaseSequence(seq1.getRequestId());
+    blockManager.releaseBlocks(seq1_dup, llmRequest1);
+    blockManager.releaseSequence(seq1_dup.getRequestId());
     EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0);
     EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

@@ -1211,10 +1211,6 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)

-    @pytest.mark.skip(
-        reason=
-        "Currently failing due to accuracy drop, https://nvbugspro.nvidia.com/bug/5625990"
-    )
     def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
         # NOTE: Test with VSWA kv cache config.
         kv_cache_config = KvCacheConfig(