mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
This MR is a continuation of #6768. In the previous merge request, OOW (out-of-window) blocks are only detached when reuse is not enabled, that is, the block movement behavior is identical between SWA and full attention when reuse is enabled. This merge request attempts to enable OOW block detach when reuse is enabled. The required changes are: - Let KV cache manager keep track of which block is used by which sequence - Remove restriction for the eviction policy to be able to release a non-leaf block Along with the development, bugs inside freeChildren and offload mechanism under getFreeBlock is resolved because they will affect the functionality this merge request is trying to achieve. When a block goes OOW, it is released from the sequence, it will be available to be reclaimed and the block is held by the eviction policy for another sequence to acquire upon calling. On the other hand, we want to potentially store the sequence for reuse. To safely achieve this, the record of block ownership is done under WindowBlockManager::getFreeBlock. If the block acquired was originally owned by another sequence that is live inside the manager, then we invalidate the sequence for store for reuse. At the end of a sequence (when removeSequence is called toward it), the KV cache manager will check if the sequence has all blocks not reclaimed by another sequence. If so, then the sequence is safe to be stored for reuse and store for reuse action will be performed. Signed-off-by: eopXD <yuehtingc@nvidia.com>
222 lines
7.7 KiB
C++
222 lines
7.7 KiB
C++
/*
|
|
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "tensorrt_llm/batch_manager/evictionPolicy.h"
|
|
|
|
using namespace tensorrt_llm::batch_manager::kv_cache_manager;
|
|
|
|
// This implements priority-based eviction.
|
|
// Blocks are assigned priority levels, with blocks at a lower priority evicted before blocks at a higher priority.
|
|
// New priority values always override the previous value.
|
|
|
|
namespace tensorrt_llm::batch_manager::eviction_policy
|
|
{
|
|
|
|
auto const kMinPriority = executor::KvCacheRetentionConfig::kMinRetentionPriority;
|
|
auto const kMaxPriority = executor::KvCacheRetentionConfig::kMaxRetentionPriority;
|
|
|
|
auto const kDefaultPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority;
|
|
executor::RetentionPriority const kDefaultSecondaryOffloadMinPriority = 30;
|
|
|
|
int const kNumCacheLevels = 2;
|
|
|
|
namespace
|
|
{
|
|
SizeType32 getCacheLevel(BlockPtr const& block)
|
|
{
|
|
return block->isPrimary() ? 0 : 1;
|
|
}
|
|
|
|
SizeType32 getPriorityIdx(executor::RetentionPriority priority)
|
|
{
|
|
return priority - kMinPriority;
|
|
}
|
|
} // namespace
|
|
|
|
void LRUEvictionPolicy::initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
|
|
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority)
|
|
{
|
|
SizeType32 startIdx = 0;
|
|
|
|
auto const defaultPriorityIdx = getPriorityIdx(kDefaultPriority);
|
|
|
|
// For each cache level, create a separate list of queues.
|
|
for (SizeType32 cacheLevel = 0; cacheLevel < kNumCacheLevels; cacheLevel++)
|
|
{
|
|
mFreeBlockIterators.reserve(mFreeBlockIterators.size() + sizes[cacheLevel]);
|
|
mFreeQueues.emplace_back(std::vector<FreeBlocksQueue>(kMaxPriority - kMinPriority + 1));
|
|
|
|
auto& freeQueue = mFreeQueues[cacheLevel][defaultPriorityIdx];
|
|
|
|
for (SizeType32 blockId = 0; blockId < sizes[cacheLevel]; blockId++)
|
|
{
|
|
// Initialize all blocks to be the default priority level
|
|
mFreeBlockIterators.emplace_back(freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId]));
|
|
}
|
|
|
|
startIdx += sizes[cacheLevel];
|
|
}
|
|
mNumFreeBlocksPerLevel = sizes;
|
|
|
|
mSecondaryOffloadMinPriority = secondaryOffloadMinPriority.value_or(kDefaultSecondaryOffloadMinPriority);
|
|
}
|
|
|
|
bool LRUEvictionPolicy::verifyQueueIntegrity()
|
|
{
|
|
bool queueCompromised = false;
|
|
for (SizeType32 cacheLevel = 0; cacheLevel < 2; cacheLevel++)
|
|
{
|
|
for (SizeType32 level = 0; level < kMaxPriority - kMinPriority + 1; level++)
|
|
{
|
|
for (auto const& block : mFreeQueues[cacheLevel][level])
|
|
{
|
|
if ((cacheLevel == 0 && !block->isPrimary()) || (cacheLevel == 1 && block->isPrimary()))
|
|
{
|
|
TLLM_LOG_WARNING("Found %s block (id %d) at cacheLevel %d",
|
|
block->isPrimary() ? "primary" : "secondary", block->getBlockId(), cacheLevel);
|
|
queueCompromised = true;
|
|
}
|
|
if (block->hasRefs())
|
|
{
|
|
TLLM_LOG_WARNING(
|
|
"Found block (id %d) with references at cacheLevel %d", block->getBlockId(), cacheLevel);
|
|
queueCompromised = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
TLLM_LOG_DEBUG("LRUEvictionPolicy queues are %s", queueCompromised ? "compromised" : "not compromised");
|
|
return !queueCompromised;
|
|
}
|
|
|
|
std::tuple<BlockPtr, bool> LRUEvictionPolicy::getFreeBlock(SizeType32 cacheLevel)
|
|
{
|
|
for (SizeType32 level = 0; level < kMaxPriority - kMinPriority + 1; level++)
|
|
{
|
|
// Find the first non-empty queue, and return the first block.
|
|
if (!mFreeQueues[cacheLevel][level].empty())
|
|
{
|
|
auto block = mFreeQueues[cacheLevel][level].front();
|
|
|
|
// mFreeQueues only contains leaf blocks, so no need to iterate through the next block pointers.
|
|
// It's possible to have a primary block with children in secondary memory. We handle this
|
|
// by freeing all descendants in WindowBlockManager::getFreeBlock. This is done either by
|
|
// offloading (preferred method) or explicitly.
|
|
return std::make_tuple(block, cacheLevel == 0 && level >= mSecondaryOffloadMinPriority);
|
|
}
|
|
}
|
|
TLLM_THROW("No free block found. This shouldn't happen!");
|
|
}
|
|
|
|
void LRUEvictionPolicy::releaseBlock(BlockPtr block)
|
|
{
|
|
releaseBlock(block, false);
|
|
}
|
|
|
|
void LRUEvictionPolicy::releaseBlock(BlockPtr block, bool toFront)
|
|
{
|
|
SizeType32 const cacheLevel = getCacheLevel(block);
|
|
SizeType32 const id = block->getBlockId();
|
|
|
|
// If there are no children, this is a leaf block. Insert into a queue.
|
|
auto& q = mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())];
|
|
if (toFront)
|
|
{
|
|
mFreeBlockIterators[id] = q.insert(q.begin(), block);
|
|
}
|
|
else
|
|
{
|
|
mFreeBlockIterators[id] = q.insert(q.end(), block);
|
|
}
|
|
|
|
mNumFreeBlocksPerLevel[cacheLevel]++;
|
|
|
|
if (block->getDurationMs().has_value()
|
|
&& block->getPriority() != executor::KvCacheRetentionConfig::kDefaultRetentionPriority)
|
|
{
|
|
auto expirationTime = getTime() + *block->getDurationMs();
|
|
block->setExpirationTime(expirationTime);
|
|
mExpiringBlockHeap.emplace(block);
|
|
}
|
|
}
|
|
|
|
SizeType32 LRUEvictionPolicy::getNumFreeBlocks(SizeType32 cacheLevel)
|
|
{
|
|
return mNumFreeBlocksPerLevel[cacheLevel];
|
|
}
|
|
|
|
void LRUEvictionPolicy::claimBlock(BlockPtr block)
|
|
{
|
|
claimBlock(block, std::nullopt, std::nullopt);
|
|
}
|
|
|
|
void LRUEvictionPolicy::claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority,
|
|
std::optional<std::chrono::milliseconds> durationMs)
|
|
{
|
|
SizeType32 const id = block->getBlockId();
|
|
SizeType32 const cacheLevel = getCacheLevel(block);
|
|
|
|
if (mFreeBlockIterators[id] != std::nullopt)
|
|
{
|
|
mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]);
|
|
mNumFreeBlocksPerLevel[cacheLevel] -= 1;
|
|
}
|
|
|
|
mFreeBlockIterators[id] = std::nullopt;
|
|
|
|
if (priority.has_value())
|
|
{
|
|
block->setPriority(*priority);
|
|
}
|
|
|
|
mExpiringBlockHeap.erase(block);
|
|
block->setDurationMs(durationMs);
|
|
}
|
|
|
|
std::chrono::steady_clock::time_point::duration LRUEvictionPolicy::getTime() const
|
|
{
|
|
return std::chrono::steady_clock::now().time_since_epoch();
|
|
}
|
|
|
|
void LRUEvictionPolicy::refresh()
|
|
{
|
|
while (!mExpiringBlockHeap.empty())
|
|
{
|
|
auto const block = *mExpiringBlockHeap.begin();
|
|
if (block->getExpirationTime() > getTime())
|
|
{
|
|
break;
|
|
}
|
|
|
|
auto const id = block->getBlockId();
|
|
auto const level = getCacheLevel(block);
|
|
|
|
mExpiringBlockHeap.erase(mExpiringBlockHeap.begin());
|
|
|
|
if (mFreeBlockIterators[id] != std::nullopt)
|
|
{
|
|
// This is already in another queue. Delete it, and bring it down to the default queue
|
|
mFreeQueues[level][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]);
|
|
auto& q = mFreeQueues[level][getPriorityIdx(kDefaultPriority)];
|
|
mFreeBlockIterators[id] = q.insert(q.end(), block);
|
|
}
|
|
block->setPriority(kDefaultPriority);
|
|
}
|
|
}
|
|
|
|
} // namespace tensorrt_llm::batch_manager::eviction_policy
|