mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
This MR is a continuation of #6768. In the previous merge request, OOW (out-of-window) blocks are only detached when reuse is not enabled, that is, the block movement behavior is identical between SWA and full attention when reuse is enabled. This merge request attempts to enable OOW block detach when reuse is enabled. The required changes are: - Let KV cache manager keep track of which block is used by which sequence - Remove restriction for the eviction policy to be able to release a non-leaf block Along with the development, bugs inside freeChildren and offload mechanism under getFreeBlock is resolved because they will affect the functionality this merge request is trying to achieve. When a block goes OOW, it is released from the sequence, it will be available to be reclaimed and the block is held by the eviction policy for another sequence to acquire upon calling. On the other hand, we want to potentially store the sequence for reuse. To safely achieve this, the record of block ownership is done under WindowBlockManager::getFreeBlock. If the block acquired was originally owned by another sequence that is live inside the manager, then we invalidate the sequence for store for reuse. At the end of a sequence (when removeSequence is called toward it), the KV cache manager will check if the sequence has all blocks not reclaimed by another sequence. If so, then the sequence is safe to be stored for reuse and store for reuse action will be performed. Signed-off-by: eopXD <yuehtingc@nvidia.com>
108 lines
4.4 KiB
C++
108 lines
4.4 KiB
C++
/*
|
|
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
|
|
|
|
#include <chrono>
|
|
#include <vector>
|
|
|
|
using namespace tensorrt_llm::batch_manager::kv_cache_manager;
|
|
|
|
namespace tensorrt_llm::batch_manager::eviction_policy
|
|
{
|
|
|
|
class BaseEvictionPolicy
|
|
{
|
|
public:
|
|
virtual ~BaseEvictionPolicy() = default;
|
|
|
|
// TODO(TRTLLM-1564): Don't use a separate `initialize` function. Ensure eviction policies can't be in-between a
|
|
// state of construction and initialization.
|
|
virtual void initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
|
|
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority)
|
|
= 0;
|
|
|
|
/// @brief Get a free block from the specified cache level
|
|
/// @returns The pointer to the free block, along with whether it can be offloaded
|
|
virtual std::tuple<BlockPtr, bool> getFreeBlock(SizeType32 cacheLevel) = 0;
|
|
/// @brief Release a block. Prioritize the block for eviction if toFront=true
|
|
virtual void releaseBlock(BlockPtr block) = 0;
|
|
virtual void releaseBlock(BlockPtr block, bool toFront) = 0;
|
|
/// @brief Get the amount of free blocks in the primary memory pool
|
|
virtual SizeType32 getNumFreeBlocks(SizeType32 cacheLevel) = 0;
|
|
/// @brief Claim a free block. Called when the cache manager allocates or reuses a new block
|
|
virtual void claimBlock(BlockPtr block) = 0;
|
|
virtual void claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority,
|
|
std::optional<std::chrono::milliseconds> durationMs)
|
|
= 0;
|
|
/// @brief Perform any per-iteration bookkeeping
|
|
virtual void refresh() = 0;
|
|
|
|
virtual bool verifyQueueIntegrity() = 0;
|
|
};
|
|
|
|
struct ExpiringBlockComparator
|
|
{
|
|
bool operator()(BlockPtr const& a, BlockPtr const& b) const
|
|
{
|
|
// If two blocks expire in the same millisecond, their expiration times will be equal. As a fallback, check the
|
|
// raw pointer values.
|
|
return a->getExpirationTime() != b->getExpirationTime() ? a->getExpirationTime() < b->getExpirationTime()
|
|
: a.get() < b.get();
|
|
}
|
|
};
|
|
|
|
class LRUEvictionPolicy : public BaseEvictionPolicy
|
|
{
|
|
public:
|
|
void initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
|
|
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority) override;
|
|
std::tuple<BlockPtr, bool> getFreeBlock(SizeType32 cacheLevel) override;
|
|
|
|
void releaseBlock(BlockPtr block) override;
|
|
void releaseBlock(BlockPtr block, bool toFront) override;
|
|
|
|
SizeType32 getNumFreeBlocks(SizeType32 cacheLevel) override;
|
|
|
|
void claimBlock(BlockPtr block) override;
|
|
void claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority,
|
|
std::optional<std::chrono::milliseconds> durationMs) override;
|
|
|
|
// Check the expiring blocks heap, and move expired blocks back to the default queue.
|
|
void refresh() override;
|
|
|
|
// Making this public and virtual makes it possible to test.
|
|
[[nodiscard]] virtual std::chrono::steady_clock::time_point::duration getTime() const;
|
|
|
|
bool verifyQueueIntegrity() override;
|
|
|
|
private:
|
|
// Queues of available leaf blocks, split by cache level and priority level
|
|
std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
|
|
// Iterators to block entries in mFreeQueues
|
|
std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
|
|
// Amount of free blocks at each cache level
|
|
std::vector<SizeType32> mNumFreeBlocksPerLevel;
|
|
// Secondary offload threshold. Blocks below this priority won't be evicted.
|
|
executor::RetentionPriority mSecondaryOffloadMinPriority;
|
|
// Heap of block times
|
|
std::set<BlockPtr, ExpiringBlockComparator> mExpiringBlockHeap;
|
|
};
|
|
|
|
} // namespace tensorrt_llm::batch_manager::eviction_policy
|