// Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13)
/*
|
|
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#pragma once

#include "tensorrt_llm/batch_manager/kvCacheConnector.h"
#include "tensorrt_llm/batch_manager/kvCacheEventManager.h"
#include "tensorrt_llm/batch_manager/kvCacheType.h"
#include "tensorrt_llm/batch_manager/llmRequest.h" // TODO forward declare
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/transferAgent.h"
#include "tensorrt_llm/kernels/kvCacheIndex.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <NvInferRuntime.h>

#include <algorithm>
#include <array>
#include <chrono>
#include <cstdint>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
|
|
|
|
namespace kvc = tensorrt_llm::executor::kv_cache;
|
|
|
|
namespace tensorrt_llm::batch_manager::eviction_policy
|
|
{
|
|
class BaseEvictionPolicy;
|
|
} // namespace tensorrt_llm::batch_manager::eviction_policy
|
|
|
|
namespace tensorrt_llm::batch_manager::kv_cache_manager
|
|
{
|
|
|
|
static constexpr SizeType32 kPrimaryLevel = 0;
|
|
|
|
static constexpr SizeType32 kSecondaryLevel = 1;
|
|
|
|
// Extra block buffer allocated for SWA to be able to always keep "window size"
|
|
// tokens held in the blocks.
|
|
static constexpr SizeType32 kSWAExtraBlock = 1;
|
|
|
|
class KVCacheBlock;
|
|
class BlockManager;
|
|
class KVCacheManager;
|
|
class KVCacheTransferManager;
|
|
|
|
using SizeType32 = tensorrt_llm::runtime::SizeType32;
|
|
using TokenIdType = tensorrt_llm::runtime::TokenIdType;
|
|
using VecTokens = std::vector<TokenIdType>;
|
|
using BeamTokens = std::vector<VecTokens>;
|
|
using BlockPtr = std::shared_ptr<KVCacheBlock>;
|
|
using FreeBlocksQueue = std::list<BlockPtr>;
|
|
using UniqueToken = tensorrt_llm::runtime::UniqueToken;
|
|
using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
|
|
using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType;
|
|
using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
|
|
using CacheSaltIDType = tensorrt_llm::runtime::CacheSaltIDType;
|
|
using MmKey = tensorrt_llm::executor::MmKey;
|
|
|
|
template <typename T>
|
|
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
|
|
|
|
//! \brief Split vector into list of blocks of given size.
|
|
//! \param vec vector to split
|
|
//! \param usableSize part of the vector that is processed
|
|
//! \param elementsPerBlock desired size of blocks
|
|
//! \param allowPartial whether to append a block smaller than `elementsPerBlock` at the end
|
|
//! \return list of blocks
|
|
template <typename T>
|
|
std::list<std::vector<T>> chopVectorIntoBlocks(
|
|
std::vector<T> const& vec, SizeType32 usableSize, SizeType32 elementsPerBlock, bool allowPartial)
|
|
{
|
|
TLLM_CHECK_WITH_INFO(
|
|
usableSize <= static_cast<SizeType32>(vec.size()), "usableSize=%d > %ld=vec.size()", usableSize, vec.size());
|
|
std::list<std::vector<T>> blockedVectors;
|
|
auto const vecEnd = vec.begin() + usableSize;
|
|
for (auto begin = vec.begin(); begin < vecEnd; begin += elementsPerBlock)
|
|
{
|
|
auto blockSize = std::min(elementsPerBlock, static_cast<SizeType32>(std::distance(begin, vecEnd)));
|
|
auto end = begin + blockSize;
|
|
if (blockSize == elementsPerBlock || allowPartial)
|
|
{
|
|
blockedVectors.emplace_back(begin, end);
|
|
}
|
|
}
|
|
return blockedVectors;
|
|
}
|
|
|
|
struct TempAttentionWindowInputs
|
|
{
|
|
bool pagedContextFMHA;
|
|
SizeType32 maxInputLen;
|
|
SizeType32 maxNumTokens;
|
|
};
|
|
|
|
struct WindowSizeMetadata
|
|
{
|
|
SizeType32 allottedPrimaryBlocks; // Number of primary blocks allotted to the windowSize
|
|
SizeType32 allottedSecondaryBlocks; // Number of secondary blocks allotted to the windowSize
|
|
SizeType32 absolutePoolsOffset; // cumulative number of pools up to manager
|
|
SizeType32 numPools; // number of managed pools
|
|
SizeType32 maxTokenNum; // Maximum token length per sequence (TODO: account for streamLLM)
|
|
SizeType32 maxBlocksPerSeq; // Maximum number of blocks per sequence
|
|
SizeType32 maxNumBlocks; // Number of primary+secondary blocks allotted to the windowSize
|
|
SizeType32 temporaryAttentionWindow; // Temporary kv cache length per sequence.
|
|
// Only needed when chunked context + sliding window attention are used
|
|
// together. And it should only be considered when allocating blocks.
|
|
SizeType32 windowSize;
|
|
bool isSWA;
|
|
|
|
std::string toString()
|
|
{
|
|
return tensorrt_llm::common::fmtstr(
|
|
"WindowSizeMetadata{ .allottedPrimaryBlocks=%d, .allottedSecondaryBlocks=%d, .absolutePoolsOffset=%d, "
|
|
".numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d, "
|
|
".windowSize=%d, .isSWA=%d }",
|
|
allottedPrimaryBlocks, allottedSecondaryBlocks, absolutePoolsOffset, numPools, maxTokenNum, maxBlocksPerSeq,
|
|
maxNumBlocks, temporaryAttentionWindow, windowSize, isSWA);
|
|
}
|
|
};
|
|
|
|
std::vector<MmKey> generateBlockHashExtraKeys(
|
|
tensorrt_llm::batch_manager::LlmRequest const& llmRequest, SizeType32 startTokenIdx, SizeType32 endTokenIdx);
|
|
|
|
struct BlockKey
|
|
{
|
|
bool usesExtraIds = false;
|
|
std::optional<LoraTaskIdType> loraTaskId = std::nullopt;
|
|
VecUniqueTokens uniqueTokens;
|
|
|
|
// Extra keys for multimodal data (similar to VLLM's approach)
|
|
// Each extra key is a pair of (mm_hash, start_offset_in_block)
|
|
std::vector<MmKey> extraKeys;
|
|
std::optional<CacheSaltIDType> cacheSaltID = std::nullopt;
|
|
|
|
BlockKey() = default;
|
|
|
|
explicit BlockKey(VecTokens const& tokens, std::optional<LoraTaskIdType> loraTaskId = std::nullopt)
|
|
: loraTaskId{loraTaskId}
|
|
{
|
|
uniqueTokens.reserve(tokens.size());
|
|
for (auto const& token : tokens)
|
|
{
|
|
uniqueTokens.push_back(UniqueToken{token, 0});
|
|
}
|
|
}
|
|
|
|
explicit BlockKey(bool usesExtraIds, std::optional<LoraTaskIdType> loraTaskId, VecUniqueTokens uniqueTokens,
|
|
std::vector<MmKey> extraKeys = {}, std::optional<CacheSaltIDType> cacheSaltID = std::nullopt)
|
|
: usesExtraIds{usesExtraIds}
|
|
, loraTaskId{loraTaskId}
|
|
, uniqueTokens{std::move(uniqueTokens)}
|
|
, extraKeys{std::move(extraKeys)}
|
|
, cacheSaltID{cacheSaltID}
|
|
{
|
|
}
|
|
|
|
bool operator==(BlockKey const& other) const noexcept;
|
|
|
|
int partialMatch(BlockKey const& other) const noexcept
|
|
{
|
|
SizeType32 numMatched{0};
|
|
if (loraTaskId == other.loraTaskId && extraKeys == other.extraKeys && cacheSaltID == other.cacheSaltID)
|
|
{
|
|
auto [matchEnd, otherMatchEnd] = std::mismatch(
|
|
uniqueTokens.begin(), uniqueTokens.end(), other.uniqueTokens.begin(), other.uniqueTokens.end());
|
|
numMatched = std::distance(uniqueTokens.begin(), matchEnd);
|
|
}
|
|
return numMatched;
|
|
}
|
|
};
|
|
|
|
std::vector<BlockKey> buildBlockKeys(std::list<VecUniqueTokens>& blockedUniqueTokens, LlmRequest const& llmRequest);
|
|
|
|
// Implement hash functor for BlockKey.
|
|
// This allows us to use unordered_map with BlockKey as key.
|
|
// Based on https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector/72073933#72073933
|
|
struct BlockKeyHasher
|
|
{
|
|
[[nodiscard]] static size_t hash(BlockKey const& blockKey, std::size_t parentHash = 0) noexcept;
|
|
|
|
std::size_t operator()(BlockKey const& blockKey, std::size_t parentHash = 0) const noexcept
|
|
{
|
|
return hash(blockKey, parentHash);
|
|
}
|
|
};
|
|
|
|
using NextBlockMap = std::unordered_map<BlockKey, BlockPtr, BlockKeyHasher>;
|
|
|
|
struct KvCacheStats
|
|
{
|
|
// Number of maximum available blocks in the primary memory pool. This is determined and set by available primary
|
|
// memory. See calculateMaxNumBlocks for details.
|
|
SizeType32 maxNumBlocks;
|
|
// Number of free blocks in the primary memory pool.
|
|
SizeType32 freeNumBlocks;
|
|
// Number of used blocks in the primary memory pool. usedNumBlocks = maxNumBlocks - freeNumBlocks.
|
|
SizeType32 usedNumBlocks;
|
|
SizeType32 toksPerBlock;
|
|
// Total number of blocks allocated by all requests.
|
|
SizeType32 allocTotalBlocks;
|
|
// Number of new blocks that were allocated.
|
|
SizeType32 allocNewBlocks;
|
|
// Number of blocks that were matched and reused.
|
|
SizeType32 reusedBlocks;
|
|
// Number of blocks that were not matched and not reused.
|
|
SizeType32 missedBlocks;
|
|
// Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
|
|
float cacheHitRate;
|
|
// Number of free blocks for every configured attention-window size.
|
|
std::map<SizeType32, SizeType32> numFreeBlocksPerWindowSize;
|
|
// GPU bytes allocated for KV-cache
|
|
std::size_t allocatedBytes{};
|
|
};
|
|
|
|
// Basic building block of a paged KV cache - a single
|
|
// cache block. This class just holds metadata, no pointers
|
|
// since it is reused across all layers.
|
|
class KVCacheBlock
{
public:
    // Linear block id, independent of which memory pool backs the block.
    using IdType = std::int32_t;

    // Sentinel id for the root of the cached-blocks (reuse) tree.
    static constexpr IdType kCachedBlocksRootId = -1;

    explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx);

    void startScheduling();

    [[nodiscard]] IdType getBlockId() const;

    [[nodiscard]] NextBlockMap getNextBlocks() const;

    // Index of this block within its backing memory pool (see mMemoryPoolBlockIndex).
    [[nodiscard]] kernels::KVCacheIndex::UnderlyingType getMemoryPoolBlockIndex() const;

    // Whether the block currently resides in the primary (fast) pool.
    [[nodiscard]] bool isPrimary() const;

    void swapMemoryPoolBlockOffset(std::shared_ptr<KVCacheBlock> otherBlock);

    void incRefCount();

    void decRefCount();

    void decSchedulingRefCount();

    [[nodiscard]] bool hasRefs() const;

    [[nodiscard]] bool hasSchedulingRefs() const;

    void setBlockKey(BlockKey const& blockKey, bool isFull);

    BlockKey getBlockKey();

    [[nodiscard]] VecUniqueTokens const& getUniqueTokens() const;

    BlockPtr const& getPrevBlock() const;

    void setPrevBlock(BlockPtr prevBlock);

    BlockPtr const& getPrevBlockInSeq() const;

    void setPrevBlockInSeq(BlockPtr prevBlock);

    void addNextBlock(BlockKey const& blockKey, BlockPtr block);

    void removeNextBlock(BlockKey const& blockKey);

    //! \brief Find block matching blockKey. If allowPartial is true, the returned block may match only a prefix of
    //! blockKey.
    //! @return tuple of [partialMatch, numMatched, block], partialMatch is true if not all the tokens of the block were
    //! matched.
    [[nodiscard]] std::tuple<bool, SizeType32, BlockPtr> findMatchingBlock(
        BlockKey const& blockKey, bool enablePartialReuse, bool copyOnPartialReuse) const;

    //! \brief Free block from previous block if present.
    void freeLeafBlock();

    [[nodiscard]] bool isFull() const;

    [[nodiscard]] bool isShared() const;

    [[nodiscard]] bool isLeaf() const;

    void setPriority(executor::RetentionPriority priority);

    [[nodiscard]] executor::RetentionPriority getPriority() const;

    void setDurationMs(std::optional<std::chrono::milliseconds> durationMs);

    [[nodiscard]] std::optional<std::chrono::milliseconds> getDurationMs() const;

    void setExpirationTime(std::optional<std::chrono::steady_clock::time_point::duration> expirationTime);

    [[nodiscard]] std::optional<std::chrono::steady_clock::time_point::duration> getExpirationTime() const;

    void setHash(size_t hash);

    // set hash automatically from block key and previous block in sequence
    void setHash();

    size_t getHash() const;

    std::vector<MmKey> getExtraKeys() const;

private:
    // Linear ID of block independent of pool
    IdType mBlockId;

    // Index of block in memory pool backing this block
    // Choice of pool is encoded into the type
    kernels::KVCacheIndex mMemoryPoolBlockIndex;

    // Number of references to the block
    SizeType32 mRefCount;

    // Number of scheduling references to the block, tracked separately from
    // mRefCount (see decSchedulingRefCount / hasSchedulingRefs)
    SizeType32 mSchedulingRefCount;

    // Key of this block in mNextBlocks map in block pointed to by mPrevBlock
    BlockKey mBlockKey;

    // Previous block in reuse tree, or nullptr if not reusing
    BlockPtr mPrevBlock;

    // Previous block in sequence, == nullptr for first block, == mPrevBlock if reusing and not first
    BlockPtr mPrevBlockInSeq;

    // Next block(s) in sequence(s)
    NextBlockMap mNextBlocks;

    // Iterator pointing to this block in mFreeBlocks.
    std::optional<FreeBlocksQueue::iterator> mFreeBlockIterator;

    // Flag indicating if block is full
    bool mIsFull;

    // Priority of the block
    executor::RetentionPriority mPriority;

    // Duration that the block's priority level applies for
    std::optional<std::chrono::milliseconds> mDurationMs;

    // Expiration time of the block
    std::optional<std::chrono::steady_clock::time_point::duration> mExpirationTime;

    // Hash for the event manager
    size_t mHash;
};
|
|
|
|
class GenerationRequest
|
|
{
|
|
public:
|
|
using SizeType32 = tensorrt_llm::runtime::SizeType32;
|
|
|
|
explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth,
|
|
std::map<SizeType32, WindowSizeMetadata> const& windowSizeToMetadata,
|
|
executor::KvCacheRetentionConfig kvCacheRetentionConfig = executor::KvCacheRetentionConfig())
|
|
: mRequestId(requestId)
|
|
, mNumTokens(numTokens)
|
|
, mBeamWidth(beamWidth)
|
|
, mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig))
|
|
, mNumFrontBlocksRemoved(0)
|
|
, mCurrentPrepopulatedPromptLen(std::numeric_limits<SizeType32>::max())
|
|
{
|
|
auto const numWindowSizes = windowSizeToMetadata.size();
|
|
mCacheBlockIds.reserve(numWindowSizes);
|
|
mCacheBlockIndices.reserve(numWindowSizes);
|
|
for (auto const [windowSize, metadata] : windowSizeToMetadata)
|
|
{
|
|
mCacheBlockIds[windowSize] = std::vector<std::vector<KVCacheBlock::IdType>>(beamWidth);
|
|
auto const numPools = metadata.numPools;
|
|
auto const maxBlocks = metadata.maxBlocksPerSeq;
|
|
mCacheBlockIndices[windowSize]
|
|
= runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}),
|
|
runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value);
|
|
auto cacheBlockIdsRange
|
|
= runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices.at(windowSize));
|
|
std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(),
|
|
tensorrt_llm::kernels::KVCacheIndex{
|
|
std::numeric_limits<tensorrt_llm::kernels::KVCacheIndex::UnderlyingType>::max()});
|
|
}
|
|
}
|
|
|
|
void addNewTokens(SizeType32 n)
|
|
{
|
|
mNumTokens += n;
|
|
}
|
|
|
|
void removeTokens(SizeType32 n)
|
|
{
|
|
TLLM_CHECK(n <= mNumTokens);
|
|
TLLM_CHECK(mNumTokens - n >= 0);
|
|
mNumTokens -= n;
|
|
}
|
|
|
|
[[nodiscard]] LlmRequest::RequestIdType getRequestId() const
|
|
{
|
|
return mRequestId;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumTokens() const
|
|
{
|
|
return mNumTokens;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumFrontBlocksRemoved() const
|
|
{
|
|
return mNumFrontBlocksRemoved;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getBeamWidth() const
|
|
{
|
|
return mBeamWidth;
|
|
}
|
|
|
|
[[nodiscard]] std::vector<std::vector<SizeType32>> const& getCacheBlockIds(SizeType32 windowSize) const
|
|
{
|
|
return mCacheBlockIds.at(windowSize);
|
|
}
|
|
|
|
[[nodiscard]] runtime::ITensor& getCacheBlockIndices(SizeType32 windowSize)
|
|
{
|
|
return *(mCacheBlockIndices.at(windowSize));
|
|
}
|
|
|
|
[[nodiscard]] runtime::ITensor const& getCacheBlockIndices(SizeType32 windowSize) const
|
|
{
|
|
return *(mCacheBlockIndices.at(windowSize));
|
|
}
|
|
|
|
void addCacheBlock(SizeType32 windowSize, SizeType32 beamIdx, KVCacheBlock::IdType blockId)
|
|
{
|
|
mCacheBlockIds.at(windowSize).at(beamIdx).push_back(blockId);
|
|
}
|
|
|
|
void changeCacheBlock(
|
|
SizeType32 windowSize, SizeType32 beamIdx, SizeType32 pagedBlockIdx, KVCacheBlock::IdType blockId)
|
|
{
|
|
mCacheBlockIds.at(windowSize).at(beamIdx).at(pagedBlockIdx) = blockId;
|
|
}
|
|
|
|
void clearCacheBlocks(SizeType32 windowSize)
|
|
{
|
|
for (auto& beamBlockIds : mCacheBlockIds.at(windowSize))
|
|
{
|
|
beamBlockIds.clear();
|
|
}
|
|
mNumFrontBlocksRemoved = 0;
|
|
}
|
|
|
|
void removeFrontBlock(SizeType32 windowSize)
|
|
{
|
|
++mNumFrontBlocksRemoved;
|
|
}
|
|
|
|
void removeLastBlock(SizeType32 windowSize)
|
|
{
|
|
for (auto& beamBlockIds : mCacheBlockIds.at(windowSize))
|
|
{
|
|
beamBlockIds.pop_back();
|
|
}
|
|
}
|
|
|
|
[[nodiscard]] executor::RetentionPriority getDecodeRetentionPriority() const
|
|
{
|
|
return mKvCacheRetentionConfig.getDecodeRetentionPriority();
|
|
}
|
|
|
|
[[nodiscard]] std::optional<std::chrono::milliseconds> getDecodeDurationMs() const
|
|
{
|
|
return mKvCacheRetentionConfig.getDecodeDurationMs();
|
|
}
|
|
|
|
[[nodiscard]] executor::KvCacheTransferMode getTransferMode() const
|
|
{
|
|
return mKvCacheRetentionConfig.getTransferMode();
|
|
}
|
|
|
|
[[nodiscard]] std::string const& getDirectory() const
|
|
{
|
|
return mKvCacheRetentionConfig.getDirectory();
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getCurrentPrepopulatedPromptLen() const
|
|
{
|
|
return mCurrentPrepopulatedPromptLen;
|
|
}
|
|
|
|
void setCurrentPrepopulatedPromptLen(SizeType32 currentPrepopulatedPromptLen)
|
|
{
|
|
TLLM_CHECK_WITH_INFO(currentPrepopulatedPromptLen <= mCurrentPrepopulatedPromptLen,
|
|
"currentPrepopulatedPromptLen must be updated non-increasingly due to the "
|
|
"assumption that smaller window sizes have shorter or equal"
|
|
"currentPrepopulatedPromptLen in WindowSizeManager::loadOrAllocateBlocks.");
|
|
mCurrentPrepopulatedPromptLen = currentPrepopulatedPromptLen;
|
|
}
|
|
|
|
private:
|
|
// Request id of the sequence
|
|
LlmRequest::RequestIdType mRequestId;
|
|
// Current number of generated tokens
|
|
SizeType32 mNumTokens;
|
|
// Number of beams
|
|
SizeType32 mBeamWidth;
|
|
// List of block ids allocated per each window size, for each beam of the sequence
|
|
std::unordered_map<SizeType32, std::vector<std::vector<KVCacheBlock::IdType>>> mCacheBlockIds;
|
|
// Tensor of block indices allocated per each window size, for each beam of the sequence
|
|
std::unordered_map<SizeType32, runtime::ITensor::SharedPtr> mCacheBlockIndices;
|
|
// The retention priority to assign to decode blocks
|
|
executor::KvCacheRetentionConfig mKvCacheRetentionConfig;
|
|
// Number of front blocks removed from the sequence
|
|
SizeType32 mNumFrontBlocksRemoved;
|
|
// Set of used blocks by the sequence
|
|
std::set<KVCacheBlock::IdType> mUsedBlocks;
|
|
// Current prepopulated prompt length
|
|
SizeType32 mCurrentPrepopulatedPromptLen;
|
|
};
|
|
|
|
// attach metadata to a pool pointer
|
|
// Attaches sizing metadata to a pair of pool pointers (primary/secondary memory).
class KVCacheBlockPool
{
public:
    SizeType32 numLayers;     // number of layers sharing this pool
    SizeType32 kvFactor;      // K/V entries per token (NOTE(review): presumably 2 for K+V -- confirm)
    SizeType32 numKvHeads;    // KV heads per layer in this pool
    SizeType32 sizePerHead;   // hidden size per head
    SizeType32 tokensPerBlock;
    SizeType32 blockSize;     // per-layer volume [numKvHeads, tokensPerBlock, sizePerHead]

    // Memory pools. Primary is fast memory, secondary is slower memory used for offloading.
    runtime::ITensor::SharedPtr primaryPtr;
    runtime::ITensor::SharedPtr secondaryPtr;

    // FP4 KV caches have extra pools that contain second level scales for dequantization.
    bool containsBlockScales;
    bool containsIndexerKCache;

    KVCacheBlockPool(SizeType32 numLayers, SizeType32 kvFactor, SizeType32 numKvHeads, SizeType32 sizePerHead,
        SizeType32 tokensPerBlock, runtime::ITensor::SharedPtr primaryPtr = nullptr,
        runtime::ITensor::SharedPtr secondaryPtr = nullptr, bool containsBlockScales = false,
        bool containsIndexerKCache = false)
        : numLayers(numLayers)
        , kvFactor(kvFactor)
        , numKvHeads(numKvHeads)
        , sizePerHead(sizePerHead)
        , tokensPerBlock(tokensPerBlock)
        // blockSize deliberately excludes numLayers and kvFactor: it is the
        // per-layer, per-K/V volume (matches WindowBlockManager::getBlockSize docs).
        , blockSize(numKvHeads * sizePerHead * tokensPerBlock)
        , primaryPtr(std::move(primaryPtr))
        , secondaryPtr(std::move(secondaryPtr))
        , containsBlockScales(containsBlockScales)
        , containsIndexerKCache(containsIndexerKCache)
    {
    }
};
|
|
|
|
// The WindowBlockManager manages the metadata of KVCacheBlocks.
|
|
// It manages multiple arrays of cache blocks called pools.
|
|
// Layers with the same number of kv heads are grouped under the same pool.
|
|
// Each pool has shape [max_blocks, num_layers, 2, num_kv_heads, tokens_pre_block, head_size], where num_layers refers
|
|
// to the number of layers with the same num_kv_heads that share that pool.
|
|
// The metadata of KVCacheBlocks is shared between layers, so each block spans all of the managed pool - an allocated
|
|
// block matches some chunk of memory in each pool. The shape of the chunk in every pool is [2, num_kv_heads,
|
|
// tokens_per_block, head_size]. The size per block and number of blocks are pre-determined and set in the constructor.
|
|
// WindowBlockManager maintains a list of free blocks at any time.
|
|
//
|
|
// FP4 KV caches allocate additional pools for block scale factors. These pools have the same
|
|
// shape as the regular KV pools, except that the the last dim is head_size / N where N is determined
|
|
// by the precise FP4 format being used (16 for NVFP4). There is one block scale pool per normal pool.
|
|
//
|
|
// BlockManager maintains a list of free blocks at any time.
|
|
// Alloc pops off the block at the front, and Free pushes it back to the vector.
|
|
// WindowBlockManager maintains a vector of lists of request ids to allocated blocks
|
|
// per sequence. This can be used to Free all blocks belonging to a sequence.
|
|
class WindowBlockManager
|
|
{
|
|
public:
|
|
using SizeType32 = tensorrt_llm::runtime::SizeType32;
|
|
using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType;
|
|
using BaseEvictionPolicy = tensorrt_llm::batch_manager::eviction_policy::BaseEvictionPolicy;
|
|
using BlockMap = std::unordered_multimap<size_t, BlockPtr>;
|
|
using BlockMapIterRange = std::pair<BlockMap::const_iterator, BlockMap::const_iterator>;
|
|
|
|
explicit WindowBlockManager(nvinfer1::DataType dtype, SizeType32 windowSize,
|
|
std::vector<SizeType32> const& managedLayers, std::vector<SizeType32> const& numKvHeadsPerLayer,
|
|
SizeType32 sizePerHead, SizeType32 tokensPerBlock, bool isSWA, SizeType32 blocksInPrimaryPool,
|
|
SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
|
|
bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
|
|
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
|
|
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager,
|
|
std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent = nullptr, bool enableIndexerKCache = false,
|
|
SizeType32 indexerKCacheQuantBlockSize = 128, SizeType32 indexerKCacheIndexHeadDim = 0);
|
|
|
|
~WindowBlockManager();
|
|
|
|
// Whether indexer K-cache pools are enabled for this window manager.
[[nodiscard]] bool isEnableIndexerKCache() const
{
    return mEnableIndexerKCache;
}
|
|
|
|
[[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
|
|
{
|
|
return mIndexerKCacheQuantBlockSize;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
|
|
{
|
|
return mIndexerKCacheIndexHeadDim;
|
|
}
|
|
|
|
void allocatePools(bool useUvm);
|
|
|
|
void releasePools();
|
|
|
|
void createIndexerKCachePools();
|
|
|
|
void startScheduling();
|
|
|
|
//! \brief Assign blocks for new sequence. Try to reuse blocks.
|
|
void addSequence(
|
|
GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks, LlmRequest& llmRequest);
|
|
|
|
//! \brief Assign blocks for new sequence. Does not try to reuse blocks.
|
|
void addSequence(GenerationRequest& sequence, SizeType32 numContextBlocks, bool isShareLastContextBlock);
|
|
|
|
//! \brief Allocate new block for each beam of the sequence.
|
|
//! \details Might free cached blocks if no free blocks are available.
|
|
void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams);
|
|
|
|
void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx);
|
|
|
|
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
|
|
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false);
|
|
|
|
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
|
|
|
|
//! \brief Pin blocks associated with a sequence to prevent eviction.
|
|
void pinBlocks(GenerationRequest& sequence);
|
|
|
|
//! \brief Release blocks of the sequence.
|
|
//! \details When llmRequest is provided and reuse is enabled, blocks will be stored.
|
|
std::optional<KVCacheBlock::IdType> releaseBlocks(
|
|
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
|
|
|
|
//! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks
|
|
void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
|
|
|
|
//! \brief Update cache offsets for last block
|
|
void updateLastCacheBlockOffsets(GenerationRequest& seq);
|
|
|
|
//! \brief Release last block in the sequence
|
|
void releaseLastBlock(GenerationRequest& sequence);
|
|
|
|
//! \brief Detach front block from the sequence
|
|
void detachFrontBlock(GenerationRequest& sequence);
|
|
|
|
//! \brief Add/detach block(s) to/from the sequence if needed
|
|
//! \details When we need a new block, we add it. For sliding window
|
|
//! attention (SWA), when a block goes out-of-window (OOW), we detach it
|
|
//! If this called in the first step of the generation phase, we may detach
|
|
//! more than a single block since there may be more than one context block
|
|
//! that goes OOW.
|
|
void adjustBlocksIfNeeded(GenerationRequest& sequence);
|
|
|
|
[[nodiscard]] SizeType32 getWindowSize() const noexcept
|
|
{
|
|
return mWindowSize;
|
|
}
|
|
|
|
[[nodiscard]] std::string const& getLogPrefix() const noexcept
|
|
{
|
|
return mLogPrefix;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept;
|
|
|
|
[[nodiscard]] SizeType32 getNumAllocTotalBlocks() const
|
|
{
|
|
return mAllocTotalBlocks;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumAllocNewBlocks() const
|
|
{
|
|
return mAllocNewBlocks;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumReusedBlocks() const noexcept
|
|
{
|
|
return mReusedBlocks;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumAllocatedBlocks() const noexcept
|
|
{
|
|
return getMaxNumBlocks() - getNumFreeBlocks();
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumMissedBlocks() const noexcept
|
|
{
|
|
return mMissedBlocks;
|
|
}
|
|
|
|
[[nodiscard]] bool hasFreeBlocks(SizeType32 numRequired = 1) const noexcept
|
|
{
|
|
return getNumFreeBlocks() >= numRequired;
|
|
}
|
|
|
|
[[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired) const noexcept
|
|
{
|
|
return mSchedulingNumFreeBlocks >= numRequired;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getMaxNumBlocks() const noexcept
|
|
{
|
|
return static_cast<SizeType32>(mAllBlocksById.size());
|
|
}
|
|
|
|
[[nodiscard]] BlockPtr const& getBlockById(KVCacheBlock::IdType blockId) const
|
|
{
|
|
return mAllBlocksById.at(blockId);
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getTokensPerBlock() const noexcept
|
|
{
|
|
return mTokensPerBlock;
|
|
}
|
|
|
|
//! \brief Get size of one K/V cache block in one layer for the specified pool.
|
|
//! @details Volume of [numKvHeads, tokensPerBlock, sizePerHead] in the specified pool.
|
|
[[nodiscard]] SizeType32 getBlockSize(SizeType32 poolIdx) const
|
|
{
|
|
return mPools.at(poolIdx).blockSize;
|
|
}
|
|
|
|
// Number of logical elements packed into one storage container element.
// With FP4 support compiled in, an FP4 KV cache packs 2 elements per container;
// every other configuration stores 1 element per container.
[[nodiscard]] SizeType32 getNumEltsPerContainer() const
{
#ifdef ENABLE_FP4
    return mDataType == nvinfer1::DataType::kFP4 ? 2 : 1;
#else
    return 1;
#endif
}
|
|
|
|
//! \brief Count managed pools, optionally excluding block-scale pools and/or
//! indexer-K-cache pools. Regular pools (neither flag set) are always counted.
[[nodiscard]] SizeType32 getNumPools(
    bool includeBlockScalePools = true, bool includeIndexerKCachePools = true) const noexcept
{
    if (includeBlockScalePools && includeIndexerKCachePools)
    {
        // Explicit cast: size() is size_t, avoid implicit narrowing to SizeType32.
        return static_cast<SizeType32>(mPools.size());
    }
    SizeType32 count = 0;
    for (auto const& pool : mPools)
    {
        bool const isRegularPool = !pool.containsBlockScales && !pool.containsIndexerKCache;
        // One predicate instead of the previous three-branch chain; a pool is
        // counted once if it is regular or its category is included.
        if (isRegularPool || (includeBlockScalePools && pool.containsBlockScales)
            || (includeIndexerKCachePools && pool.containsIndexerKCache))
        {
            count++;
        }
    }
    return count;
}
|
|
|
|
[[nodiscard]] KVCacheBlockPool const& getPool(SizeType32 poolIdx) const
|
|
{
|
|
return mPools.at(poolIdx);
|
|
}
|
|
|
|
[[nodiscard]] bool containsBlockScales(SizeType32 poolIdx) const
|
|
{
|
|
return mPools.at(poolIdx).containsBlockScales;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumPrimaryBlocks() const
|
|
{
|
|
return mNumPrimaryBlocks;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getNumSecondaryBlocks() const
|
|
{
|
|
return mNumSecondaryBlocks;
|
|
}
|
|
|
|
[[nodiscard]] SizeType32 getLayerPoolIdx(SizeType32 layerIdx) const
|
|
{
|
|
return mLayerToPoolIndex.at(layerIdx);
|
|
}
|
|
|
|
//! \brief Maps a global layer index to its layer index within its pool.
|
|
//! \details If we only have one pool, then getPoolLayerIdx(i) == i. Otherwise,
|
|
//! \details gives the layer index into the getLayerPoolIdx(i).
|
|
[[nodiscard]] SizeType32 getPoolLayerIdx(SizeType32 layerIdx) const
|
|
{
|
|
return mLayerToIndexWithinPool.at(layerIdx);
|
|
}
|
|
|
|
void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx,
|
|
SizeType32 blockIdx, KVCacheBlock::IdType blockId) const;
|
|
|
|
//! \brief Bring offloaded block from secondary to primary memory.
|
|
//! \details Does nothing if block is already in primary memory.
|
|
void onboardBlock(GenerationRequest& sequence, BlockPtr const& offloadBlock,
|
|
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
|
|
|
|
//! \brief Bring block from primary to secondary memory.
|
|
//! \details Does nothing if block is already in secondary memory.
|
|
void offloadBlock(BlockPtr const& block, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
|
|
std::string const& directory = "");
|
|
|
|
//! \brief Find first new block that must be allocated for context phase and return it's concatenated token vectors.
|
|
//! \details Only full blocks are considered.
|
|
[[nodiscard]] std::optional<BlockKey> findNewContextBlock(
|
|
VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const;
|
|
|
|
[[nodiscard]] runtime::BufferManager const& getBufferManager() const
|
|
{
|
|
return mBufferManager;
|
|
}
|
|
|
|
    //! \brief Sync internal streams used by transfer manager with buffer manager stream
    void syncTransferManagerWithBufferManager();

    //! \brief Perform per-request bookkeeping
    void refreshBlocks();

    //! \brief Return whether \p block is currently part of the reuse radix tree.
    [[nodiscard]] static bool blockInRadixTree(BlockPtr const& block);

    //! \brief Store blocks in cached blocks.
    //! \param blockKeys Key of each block.
    //! \param blockIds Id of each block.
    //! \param pinBlocks If true, increment ref count for blocks while storing (pin on store).
    //! \return Pair of (num blocks stored for reuse, vector of pinned block IDs).
    [[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
        std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
        bool pinBlocks = false);

    //! \brief Verify internal queue invariants; returns true when consistent.
    //! \details NOTE(review): exact checks are implemented elsewhere — confirm against the .cpp.
    [[nodiscard]] bool verifyQueueIntegrity();
|
|
|
|
// Only needed when sliding window attention + paged context fmha are used together.
|
|
// In that case, a temporary kv cache buffer with maximum chunk size (maxNumTokens) is needed.
|
|
// TODO: There are several things that can be improved later.
|
|
// 1. a dynamic temporary kv cache allocation based on real chunk size might be needed.
|
|
// 2. reuse the same temporary kv cache buffer among all layers in the same pool.
|
|
[[nodiscard]] SizeType32 calculateTemporaryAttentionWindow(
|
|
std::optional<TempAttentionWindowInputs> const& inputs) const
|
|
{
|
|
|
|
if (inputs && inputs->pagedContextFMHA && (inputs->maxInputLen > mWindowSize))
|
|
{
|
|
auto window = std::min(inputs->maxNumTokens, inputs->maxInputLen - mWindowSize);
|
|
window = std::max(window, 0); // clamp negative values to 0
|
|
return window;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
    //! \brief Return whether this window uses sliding window attention (SWA).
    [[nodiscard]] bool isSWA() const
    {
        return mIsSWA;
    }
|
|
|
|
    //! \brief Look up a block in the reuse tree by its key; returns nullptr-able pointer.
    [[nodiscard]] std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(BlockKey const& blockKey);

    //! \brief Unpin blocks by block ids directly
    void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);
|
|
|
|
void initializeSequenceStorageValidity(LlmRequest::RequestIdType requestId)
|
|
{
|
|
mIsValidStoreForReuseSequence[requestId] = true;
|
|
}
|
|
|
|
    //! \brief Stop tracking store-for-reuse validity of \p requestId.
    void releaseSequenceStorageValidity(LlmRequest::RequestIdType requestId)
    {
        mIsValidStoreForReuseSequence.erase(requestId);
    }
|
|
|
|
//! \brief Return whether this sequence is valid for store for reuse
|
|
[[nodiscard]] bool isSequenceValidForStoreForReuse(LlmRequest::RequestIdType requestId) const
|
|
{
|
|
TLLM_CHECK_WITH_INFO(mIsValidStoreForReuseSequence.count(requestId) > 0, "Sequence should be bookkeeped");
|
|
return mIsValidStoreForReuseSequence.at(requestId);
|
|
}
|
|
|
|
    //! \brief Drop the whole reuse tree by replacing its root with a fresh dummy block.
    //! \details The root swap is performed under mCachedBlocksRootMutex.
    void resetReuseState()
    {
        std::lock_guard<std::mutex> lock(mCachedBlocksRootMutex);
        mCachedBlocksRoot
            = std::make_shared<KVCacheBlock>(KVCacheBlock::kCachedBlocksRootId, tensorrt_llm::kernels::KVCacheIndex{0});
    }
|
|
|
|
private:
    //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
    void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);

    //! \brief Add single block to all beams of sequence.
    void addBlockToAllBeams(BlockPtr& block, GenerationRequest& sequence);

    //! \brief Try to load blocks from cache. Allocate new blocks if necessary.
    //! \param blockKeys Key of each block.
    //! \param numContextBlocks Number of context blocks to process.
    //! \param sequence Sequence to which blocks are assigned.
    //! \param perBlockRetentions Retention priority/duration applied per block.
    //! \return Number of matched tokens from loaded blocks.
    SizeType32 loadOrAllocateBlocks(std::vector<BlockKey> const& blockKeys, SizeType32 numContextBlocks,
        GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions,
        executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");

    //! \brief Free block and all its descendants. This makes block a claimed leaf block.
    void freeChildren(BlockPtr const& block);

    //! \brief Find block least likely to be reused, free it if necessary and return.
    //! \param sequence Sequence which the free block is allocated for
    [[nodiscard]] BlockPtr getFreeBlock(GenerationRequest& sequence,
        executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority,
        std::optional<std::chrono::milliseconds> durationMs = std::nullopt,
        executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");

    //! \brief Calls KVCacheBlock::freeLeafBlock to remove block from search tree.
    void freeLeafBlock(BlockPtr const& block);

    //! \brief For FP4 quantization. Creates pool objects for FP4 block scalars.
    void createBlockScalePools(SizeType32 blockSize);
|
|
|
|
private:
    // Element data type of the KV cache.
    nvinfer1::DataType mDataType;
    // Attention window size served by this manager.
    SizeType32 mWindowSize;

    // Number of blocks in pools
    SizeType32 mNumPrimaryBlocks;
    SizeType32 mNumSecondaryBlocks;

    // List of allocated blocks for each sequence
    std::unordered_map<LlmRequest::RequestIdType, std::vector<BlockPtr>> mAllocatedBlocksPerSeq;

    // Pool per unique numKvHeads in the model
    std::vector<KVCacheBlockPool> mPools;

    // Matching layers to their respective pools: {<layer #0>: <pool idx 2>, }, etc.
    std::unordered_map<SizeType32, SizeType32> mLayerToPoolIndex;
    // Matching layers to their index *within* their respective pools: {..., <layer 3>: <idx 2 within pool> }. See
    // getPoolLayerIdx
    std::unordered_map<SizeType32, SizeType32> mLayerToIndexWithinPool;

    // Whether offloaded blocks should be onboarded before reuse.
    bool mOnboardBlocks;
    // Buffer manager
    runtime::BufferManager mBufferManager;

    // Used to keep track of number of free blocks during scheduling
    SizeType32 mSchedulingNumFreeBlocks;
    // Number of tokens per one block
    SizeType32 mTokensPerBlock;
    // Whether this window is sliding window attention/full attention
    bool mIsSWA;
    // List of all blocks by idx
    std::vector<BlockPtr> mAllBlocksById;
    // Dummy block acting as root for BlockToken searches
    BlockPtr mCachedBlocksRoot;
    // KV cache type (self or cross)
    CacheType mCacheType;
    // Eviction Policy
    std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
    // Event manager
    std::shared_ptr<KVCacheEventManager> mEventManager;
    // Pointer to parent loopback agent
    std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
    // Transfer manager
    std::shared_ptr<KVCacheTransferManager> mTransferManager;

    // Statistics for block allocations/reuse
    // Total number of blocks allocated by all requests
    SizeType32 mAllocTotalBlocks;
    // Number of new blocks that were allocated
    SizeType32 mAllocNewBlocks;
    // Number of blocks that were reused
    SizeType32 mReusedBlocks;
    // Number of unique blocks that were reused
    SizeType32 mReusedUniqueBlocks;
    // Number of blocks that were not reused
    SizeType32 mMissedBlocks;
    // Only be 1 or 2. If 2: general KV stored. If 1: K == V for any token, so only K is stored to optimize the
    // max_num_tokens(For DeepSeek). Controlled by mCacheType
    SizeType32 mKVFactor;
    // Ids of blocks seen as reused — presumably backs mReusedUniqueBlocks; verify against .cpp.
    // NOTE(review): name deviates from the mXxx member naming convention.
    std::set<KVCacheBlock::IdType> reusedBlockIds;
    // Prefix used for log messages emitted by this manager.
    std::string const mLogPrefix;
    // Number of reused tokens
    double mReusedTokens;
    // Total number of input tokens
    double mTotalInputTokens;
    // Whether blocks that are partially matched should be reused.
    bool mEnablePartialReuse;
    // Whether partially matched blocks that are already in use should be copied and reused.
    bool mCopyOnPartialReuse;
    // The kv cache connector manager
    std::shared_ptr<kv_connector::KvCacheConnectorManager> mKvCacheConnectorManager;

    // Mutex for the cached blocks root
    std::mutex mCachedBlocksRootMutex;

    // Record which sequence is using the block
    std::map<KVCacheBlock::IdType, LlmRequest::RequestIdType> mBlockToSequence;
    // Record whether a sequence has all blocks held valid.
    // The boolean value is set to true upon first encounter of a new sequence.
    // It may be invalidated to false when other sequence acquires a block that
    // is used by another sequence.
    std::map<LlmRequest::RequestIdType, bool> mIsValidStoreForReuseSequence;

    // Whether to enable indexer K cache
    bool mEnableIndexerKCache;
    // Quant block size for indexer K cache
    SizeType32 mIndexerKCacheQuantBlockSize;
    // Index head dim for indexer K cache
    SizeType32 mIndexerKCacheIndexHeadDim;
|
|
};
|
|
|
|
//! \brief Manages KV cache blocks across all attention window sizes.
//! \details Owns one WindowBlockManager per unique window size and routes per-window
//! operations to the matching manager. Pool indices exposed here are "absolute" across
//! all windows; each WindowBlockManager uses indices relative to its own pools.
class BlockManager
{
public:
    using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>;
    using SizeType32 = tensorrt_llm::runtime::SizeType32;
    using BaseEvictionPolicy = tensorrt_llm::batch_manager::eviction_policy::BaseEvictionPolicy;

    explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead,
        SizeType32 tokensPerBlock, BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences,
        CudaStreamPtr stream, SizeType32 maxSequenceLength, SizeType32 maxBeamWidth,
        std::vector<SizeType32> const& maxAttentionWindowVec,
        std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
        SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
        std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
        bool copyOnPartialReuse = true,
        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr,
        std::optional<kvc::BaseAgentConfig> agentConfig = std::nullopt, bool enableIndexerKCache = false,
        SizeType32 indexerKCacheQuantBlockSize = 128, SizeType32 indexerKCacheIndexHeadDim = 0);

    //! \brief Whether the indexer K cache is enabled.
    [[nodiscard]] bool isEnableIndexerKCache() const
    {
        return mIsEnableIndexerKCache;
    }

    //! \brief Quantization block size of the indexer K cache.
    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const
    {
        return mIndexerKCacheQuantBlockSize;
    }

    //! \brief Index head dimension of the indexer K cache.
    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const
    {
        return mIndexerKCacheIndexHeadDim;
    }

    // Non-copyable: owns per-window managers and CUDA resources.
    BlockManager(BlockManager const&) = delete;
    BlockManager& operator=(BlockManager const&) = delete;

    //! \brief Calculate the proportional share each window size receives of the total memory pool
    //! \details Example: (uniqueWindowSizeToLayers={1024: [1], 4096: [0, 4, 5], 8192: [2, 3]})
    //! Would Return: {1024: 0.0345, 4096: 0.4138, 8192: 0.5517} [sums to 1.0].
    //! See: TEST_F(KVCacheManagerTest, BlockManagerTestWindowSizeToShare).
    //! \return Map<windowSize, share> where share is a float between 0 and 1. Shares sum to 1.0.
    static std::map<SizeType32, float> calculateWindowSizeToShare(
        std::map<SizeType32, std::vector<SizeType32>> const& uniqueWindowSizeToLayers,
        std::map<SizeType32, SizeType32> const& cacheSizePerTokenPerWindowSize);

    //! \brief Allocate memory pools of every window manager (optionally in UVM).
    void allocatePools(bool useUvm);

    //! \brief Add a sequence for a given window size, using llmRequest for cache lookup.
    void addSequence(GenerationRequest& sequence, SizeType32 inputLength, SizeType32 numContextBlocks,
        LlmRequest& llmRequest, SizeType32 windowSize);

    //! \brief Assign blocks for a new sequence.
    //! \param sequence The GenerationRequest to process.
    //! \param numContextBlocks Number of context blocks to allocate.
    //! \param windowSize Attention window size
    //! \param isShareLastContextBlock If true, the last context block is shared among beams.
    void addSequence(
        GenerationRequest& sequence, SizeType32 numContextBlocks, SizeType32 windowSize, bool isShareLastContextBlock);

    //! \brief Allocate one more block for the sequence under the given window size.
    void allocateBlock(GenerationRequest& sequence, SizeType32 windowSize);

    //! \brief Replace the shared block at \p blockIdx for the given window size.
    void replaceSharedBlock(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx);

    //! \brief Release all blocks of a sequence; optionally pin them on release.
    std::optional<KVCacheBlock::IdType> releaseBlocks(
        GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);

    //! \brief Store a sequence's blocks for later reuse; returns ids of pinned blocks.
    [[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
        GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);

    //! \brief Release scheduling bookkeeping for a request (see startScheduling).
    void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);

    /// @brief Pin all blocks associated with a sequence across all window managers.
    /// @param sequence The generation request whose blocks should be pinned.
    void pinBlocks(GenerationRequest& sequence);

    //! \brief Unpin blocks by their ids across window managers.
    void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);

    //! \brief Release the last block of a sequence for the given window size.
    void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize);

    //! \brief Write offsets of \p blockId into the offset table for the given window size.
    void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx,
        SizeType32 blockIdx, KVCacheBlock::IdType blockId, SizeType32 windowSize) const;

    // WILL NOT WORK FOR VARIABLE WINDOW ATTENTION
    [[nodiscard]] std::optional<BlockKey> findNewContextBlock(
        VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const;

    //! \brief Bring offloaded block from secondary to primary memory for window size.
    //! \details Does nothing if block is already in primary memory.
    void onboardBlock(GenerationRequest& sequence, BlockPtr const& offloadBlock, SizeType32 windowSize,
        executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");

    //! \brief Bring block from primary to secondary memory for window size.
    //! \details Does nothing if block is already in secondary memory.
    void offloadBlock(BlockPtr const& block, SizeType32 windowSize,
        executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");

    //! \brief Delegate storeBlocks to the manager of \p windowSize.
    //! \return Pair of (num blocks stored for reuse, vector of pinned block IDs).
    [[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
        std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
        SizeType32 windowSize, bool pinBlocks = false)
    {
        return mWindowBlockManagers.at(windowSize).storeBlocks(blockKeys, blockIds, pinBlocks);
    }

    //! \brief Verify queue invariants of the manager for \p windowSize.
    [[nodiscard]] bool verifyQueueIntegrity(SizeType32 windowSize);

    //! \brief Release all memory pools.
    void releasePools();

    //! \brief Begin a scheduling pass (snapshots free-block counts).
    void startScheduling();

    //! \brief Number of free blocks per window size.
    [[nodiscard]] std::map<SizeType32, SizeType32> getNumFreeBlocksPerWindowSize() const
    {
        std::map<SizeType32, SizeType32> numFreeBlocksPerWindowSize;
        for (auto const& [windowSize, manager] : mWindowBlockManagers)
        {
            numFreeBlocksPerWindowSize[windowSize] = manager.getNumFreeBlocks();
        }
        return numFreeBlocksPerWindowSize;
    }

    //! \brief Total number of free blocks summed over all window sizes.
    [[nodiscard]] SizeType32 getNumFreeBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getNumFreeBlocks(); });
    }

    //! \brief Whether the manager for \p windowSize still has \p numRequired schedulable blocks.
    [[nodiscard]] bool schedulingHasFreeBlocks(SizeType32 numRequired, SizeType32 windowSize) const
    {
        return mWindowBlockManagers.at(windowSize).schedulingHasFreeBlocks(numRequired);
    }

    //! \brief Total blocks allocated by all requests, summed over window sizes.
    [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getNumAllocTotalBlocks(); });
    }

    //! \brief Smallest configured window size, or 0 when no managers exist.
    [[nodiscard]] SizeType32 getFirstWindowSize() const
    {
        if (mWindowBlockManagers.empty())
        {
            return 0;
        }
        return mWindowBlockManagers.begin()->first;
    }

    //! \brief Number of newly allocated blocks, summed over window sizes.
    [[nodiscard]] SizeType32 getNumAllocNewBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getNumAllocNewBlocks(); });
    }

    //! \brief Number of reused blocks, summed over window sizes.
    [[nodiscard]] SizeType32 getNumReusedBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getNumReusedBlocks(); });
    }

    //! \brief Number of blocks that were not reused, summed over window sizes.
    [[nodiscard]] SizeType32 getNumMissedBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getNumMissedBlocks(); });
    }

    //! \brief Total number of model layers managed.
    [[nodiscard]] SizeType32 getNumLayers() const
    {
        return mNumLayers;
    }

    //! \brief KV cache type (self or cross).
    [[nodiscard]] CacheType getCacheType() const
    {
        return mCacheType;
    }

    //! \brief Absolute pool index of the pool serving \p layerIdx
    //! (window-size offset + index relative to that window's pools).
    [[nodiscard]] SizeType32 getLayerPoolIdx(SizeType32 layerIdx) const
    {
        auto const& manager = windowManagerByLayer(layerIdx);
        auto const absoluteOffset = absolutePoolsOffset(manager);
        auto const relativePoolIndex = manager.getLayerPoolIdx(layerIdx);
        return absoluteOffset + relativePoolIndex;
    }

    //! \brief Index of \p layerIdx within its pool (see WindowBlockManager::getPoolLayerIdx).
    [[nodiscard]] SizeType32 getPoolLayerIdx(SizeType32 layerIdx) const
    {
        return windowManagerByLayer(layerIdx).getPoolLayerIdx(layerIdx);
    }

    //! \brief Number of tokens stored per block.
    [[nodiscard]] SizeType32 getTokensPerBlock() const noexcept
    {
        return mTokensPerBlock;
    }

    //! \brief Device id of the CUDA stream used by this manager.
    [[nodiscard]] SizeType32 getStreamDevice() const
    {
        return mStream->getDevice();
    }

    //! \brief Fetch latest KV cache events, optionally waiting up to \p timeout.
    [[nodiscard]] std::deque<executor::KVCacheEvent> getLatestEvents(
        std::optional<std::chrono::milliseconds> timeout) const;

    //! \brief Flush buffered events of the event manager, if one is configured.
    void flushIterationEvents()
    {
        if (mEventManager)
        {
            mEventManager->flush();
        }
    }

    //! \brief Window size that absolute pool \p poolIdx belongs to.
    [[nodiscard]] SizeType32 getPoolWindowSize(SizeType32 poolIdx) const
    {
        return mAbsolutePoolToWindowSize.at(poolIdx);
    }

    //! \brief Block size (elements) of absolute pool \p poolIdx.
    [[nodiscard]] SizeType32 getBlockSize(SizeType32 poolIdx) const
    {
        return getPool(poolIdx).blockSize;
    }

    //! \brief Total number of pools across all windows, optionally counting
    //! block-scale and indexer-K-cache pools.
    [[nodiscard]] SizeType32 getNumPools(
        bool includeBlockScalePools = true, bool includeIndexerKCachePools = true) const
    {
        return sumWindows([includeBlockScalePools, includeIndexerKCachePools](auto const& manager)
            { return manager.getNumPools(includeBlockScalePools, includeIndexerKCachePools); });
    }

    //! \brief Metadata for every configured window size.
    [[nodiscard]] std::map<SizeType32, WindowSizeMetadata> const& getWindowSizesMetadata() const noexcept
    {
        return mWindowSizeToMetadata;
    }

    //! \brief Metadata for a single window size.
    //! \details NOTE(review): marked noexcept but uses map::at — an unknown
    //! windowSize terminates rather than throws; callers must pass valid sizes.
    [[nodiscard]] WindowSizeMetadata getWindowSizeMetadata(SizeType32 windowSize) const noexcept
    {
        return mWindowSizeToMetadata.at(windowSize);
    }

    //! \brief Whether layers use more than one attention window size.
    [[nodiscard]] bool isVariableWindow() const noexcept
    {
        return mIsVariableWindow;
    }

    //! \brief maxBlocksPerSeq for the single configured window size; asserts non-variable window.
    [[nodiscard]] SizeType32 getMaxBlockPerSeqWhenSingleWindowSize() const
    {
        TLLM_CHECK_WITH_INFO(!isVariableWindow(),
            "This function was called assuming there is only a single window size, and therefore a single "
            "maxBlocksPerSeq");
        auto const windowSize = windowManagerByLayer(0).getWindowSize();
        auto const onlyWindowSizeMetadata = getWindowSizeMetadata(windowSize);
        return onlyWindowSizeMetadata.maxBlocksPerSeq;
    }

    //! \brief Whether layers have differing numbers of KV heads.
    [[nodiscard]] bool isVariableGQA() const noexcept
    {
        return mIsVariableGQA;
    }

    //! \brief Primary (GPU) pool tensor for absolute pool \p poolIdx.
    [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 poolIdx) const
    {
        return getPool(poolIdx).primaryPtr;
    }

    //! \brief Secondary (offload) pool tensor for absolute pool \p poolIdx.
    [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool(SizeType32 poolIdx) const
    {
        return getPool(poolIdx).secondaryPtr;
    }

    //! \brief Number of allocated blocks, summed over window sizes.
    [[nodiscard]] SizeType32 getNumAllocatedBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getNumAllocatedBlocks(); });
    }

    //! \brief Maximum number of blocks, summed over window sizes.
    [[nodiscard]] SizeType32 getMaxNumBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getMaxNumBlocks(); });
    }

    //! \brief Block object for \p blockId within the manager of \p windowSize.
    [[nodiscard]] BlockPtr const& getBlockById(KVCacheBlock::IdType blockId, SizeType32 windowSize) const
    {
        return mWindowBlockManagers.at(windowSize).getBlockById(blockId);
    }

    //! \brief Look up a block in the reuse tree of \p windowSize by key.
    [[nodiscard]] std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(
        BlockKey const& blockKey, SizeType32 windowSize)
    {
        return mWindowBlockManagers.at(windowSize).findBlocksInReuseTreeByBlockKey(blockKey);
    }

    //! \brief Number of primary blocks, summed over window sizes.
    [[nodiscard]] SizeType32 getNumPrimaryBlocks() const
    {
        return sumWindows([](auto const& manager) { return manager.getNumPrimaryBlocks(); });
    }

    //! \brief Whether absolute pool \p poolIdx holds block scales (FP4).
    [[nodiscard]] bool containsBlockScales(SizeType32 poolIdx) const
    {
        return getPool(poolIdx).containsBlockScales;
    }

    //! \brief Store context blocks
    void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest);

    //! \brief Store newest block for reuse
    void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);

    //! \brief Sync internal streams used by transfer manager with buffer manager stream
    void syncTransferManagerWithBufferManager();

    //! \brief Perform per-request bookkeeping
    void refreshBlocks();

    //! \brief Buffer manager of the window manager for \p windowSize.
    [[nodiscard]] runtime::BufferManager const& getBufferManager(SizeType32 windowSize) const
    {
        return mWindowBlockManagers.at(windowSize).getBufferManager();
    }

    //! \brief Resolve an absolute pool index to the owning window manager's pool.
    [[nodiscard]] KVCacheBlockPool const& getPool(SizeType32 poolIdx) const
    {
        auto const windowSize = getPoolWindowSize(poolIdx);
        auto const relativePoolIndex = mAbsolutePoolToRelativePoolIndex.at(poolIdx);
        return mWindowBlockManagers.at(windowSize).getPool(relativePoolIndex);
    }

    //! \brief Update cache offsets for blocks initiated from sequence
    void updateSequenceCacheBlockOffsets(GenerationRequest& seq, SizeType32 windowSize);

    //! \brief Update cache offsets for block at index
    void updateCacheBlockOffsetsAtIdx(GenerationRequest& seq, SizeType32 windowSize, SizeType32 blockIdx);

    //! \brief Add/detach block(s) to/from the sequence if needed
    //! \details When we need a new block, we add it. For sliding window
    //! attention (SWA), when a block goes out-of-window (OOW), we detach it.
    //! If this is called in the first step of the generation phase, we may
    //! detach more than a single block since there may be more than one
    //! context block that goes OOW.
    void adjustBlocksIfNeeded(GenerationRequest& sequence);

    //! \brief Return whether the sequence is already managed by the block manager
    [[nodiscard]] bool isSequenceHeld(LlmRequest::RequestIdType requestId) const
    {
        return mManagedSequences.count(requestId) > 0;
    }

    //! \brief Add a sequence to the managed sequences
    //! \details Take the sequence into account for the manager. Initialize
    //! sequence storage validity under all window sizes.
    void holdSequence(LlmRequest::RequestIdType requestId)
    {
        mManagedSequences.insert(requestId);
        for (auto const& [windowSize, metadata] : mWindowSizeToMetadata)
        {
            mWindowBlockManagers.at(windowSize).initializeSequenceStorageValidity(requestId);
        }
    }

    //! \brief Remove a sequence from the managed sequences.
    //! \details Remove sequence from the managed sequences and remove sequence
    //! storage validity under all window sizes.
    void releaseSequence(LlmRequest::RequestIdType requestId)
    {
        mManagedSequences.erase(requestId);
        for (auto const& [windowSize, metadata] : mWindowSizeToMetadata)
        {
            mWindowBlockManagers.at(windowSize).releaseSequenceStorageValidity(requestId);
        }
    }

    //! \brief Return whether the sequence is still valid for store-for-reuse
    //! regarding the specific window size.
    //! \details Currently this utility function is only used under
    //! kvCacheManagerTest.cpp. Checking for store-for-reuse for each window
    //! size is done in an iterating fashion under BlockManager::releaseBlocks.
    bool isSequenceValidForStoreForReuse(LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
    {
        TLLM_CHECK_WITH_INFO(
            mWindowBlockManagers.count(windowSize) > 0, "Querying window size is not found under mWindowBlockManager");
        return mWindowBlockManagers.at(windowSize).isSequenceValidForStoreForReuse(requestId);
    }

    //! \brief Reset the reuse tree of every window manager.
    void resetReuseState()
    {
        for (auto& [windowSize, manager] : mWindowBlockManagers)
        {
            manager.resetReuseState();
        }
    }

private:
    //! \brief Window manager that owns the given layer.
    [[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const
    {
        return mWindowBlockManagers.at(mLayerToWindowSize.at(layerIdx));
    }

    //! \brief Sum \p produce over all window managers.
    [[nodiscard]] SizeType32 sumWindows(std::function<SizeType32(WindowBlockManager const&)> produce) const
    {
        return std::accumulate(mWindowBlockManagers.cbegin(), mWindowBlockManagers.cend(), SizeType32{0},
            [&produce](SizeType32 acc, auto const& manager) { return acc + produce(manager.second); });
    }

    //! \brief Offset of \p manager's pools within absolute (cross-window) pool indexing.
    [[nodiscard]] SizeType32 absolutePoolsOffset(WindowBlockManager const& manager) const
    {
        auto const windowSize = manager.getWindowSize();
        return getWindowSizeMetadata(windowSize).absolutePoolsOffset;
    }

private:
    // Total number of model layers.
    SizeType32 mNumLayers;
    // Number of tokens per block.
    SizeType32 mTokensPerBlock;
    // Event manager (may be null).
    std::shared_ptr<KVCacheEventManager> mEventManager;
    // Loopback agent (may be null).
    std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
    // CUDA stream used by the managers.
    CudaStreamPtr mStream;
    // KV cache type (self or cross).
    CacheType mCacheType;

    // Whether more than one window size is configured.
    bool mIsVariableWindow;
    // Whether layers have differing KV head counts.
    bool mIsVariableGQA;

    // One block manager per unique window size.
    std::map<SizeType32, WindowBlockManager> mWindowBlockManagers;
    // Metadata per window size.
    std::map<SizeType32, WindowSizeMetadata> mWindowSizeToMetadata;
    // Layer index -> window size of that layer.
    std::vector<SizeType32> mLayerToWindowSize;
    // Absolute pool index -> owning window size.
    std::vector<SizeType32> mAbsolutePoolToWindowSize;
    // Absolute pool index -> pool index within the owning window manager.
    std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex;
    // Record what sequences are currently managed by the block manager
    std::set<LlmRequest::RequestIdType> mManagedSequences;

    // Indexer K cache configuration.
    bool mIsEnableIndexerKCache{false};
    SizeType32 mIndexerKCacheQuantBlockSize{0};
    SizeType32 mIndexerKCacheIndexHeadDim{0};
};
|
|
|
|
//! \brief Dimensions used to size the KV cache block-offset table.
struct OffsetTableDimensions
{
    // Maximum number of blocks per sequence.
    SizeType32 maxBlocksPerSeq;
    // Number of memory pools.
    SizeType32 numPools;
    // KV cache type (self or cross).
    CacheType cacheType;
};
|
|
|
|
class BaseKVCacheManager
|
|
{
|
|
public:
|
|
using SizeType32 = tensorrt_llm::runtime::SizeType32;
|
|
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>;
|
|
using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType;
|
|
|
|
virtual ~BaseKVCacheManager() {}
|
|
|
|
virtual void allocatePools(bool useUvm = false) = 0;
|
|
|
|
virtual void releasePools() = 0;
|
|
|
|
virtual void startScheduling() = 0;
|
|
|
|
[[nodiscard]] virtual SizeType32 getTokensPerBlock() const = 0;
|
|
|
|
[[nodiscard]] virtual SizeType32 getMaxNumBlocks() const = 0;
|
|
|
|
[[nodiscard]] virtual SizeType32 getUsedNumBlocks() const = 0;
|
|
|
|
[[nodiscard]] virtual SizeType32 getNumFreeBlocks() const = 0;
|
|
|
|
[[nodiscard]] virtual SizeType32 getNumPools() const = 0;
|
|
|
|
// only used by test
|
|
[[nodiscard]] virtual SizeType32 getNumReusedBlocks() const noexcept = 0;
|
|
|
|
[[nodiscard]] virtual KvCacheStats getKvCacheStats() const = 0;
|
|
|
|
[[nodiscard]] virtual OffsetTableDimensions getOffsetTableDimensions() const = 0;
|
|
|
|
[[nodiscard]] virtual std::deque<executor::KVCacheEvent> getLatestEvents(
|
|
std::optional<std::chrono::milliseconds> timeout = std::nullopt) const
|
|
= 0;
|
|
|
|
[[nodiscard]] virtual BlockManager const& getBlockManager() const = 0;
|
|
|
|
/// @brief Function that computes the number of KV cache blocks needed to advance a request by one or two
|
|
/// iterations
|
|
/// @param req The request for which we need to calculate the number of needed KV cache blocks
|
|
/// @return The number of blocks
|
|
[[nodiscard]] virtual SizeType32 getNeededBlocksOneStep(
|
|
LlmRequest const& req, bool twoStepsLookAhead, SizeType32 windowSize) const
|
|
= 0;
|
|
|
|
/// @brief Function that computes the number of KV cache blocks needed to advance a request to completion (i.e. for
|
|
/// maxNewTokens)
|
|
/// @param req The request for which we need to calculate the number of needed KV cache blocks
|
|
/// @return The number of blocks
|
|
[[nodiscard]] virtual SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const
|
|
= 0;
|
|
|
|
/// @brief Pin blocks associated with a request to prevent eviction.
|
|
/// @param requestId The ID of the request whose blocks should be pinned.
|
|
virtual void pinBlocks(LlmRequest::RequestIdType requestId) = 0;
|
|
|
|
/// @brief Increase size for request at seqSlotIdx. Allocate new KV cache block(s) if needed.
|
|
virtual void addToken(LlmRequest::RequestIdType requestId) = 0;
|
|
|
|
/// @brief Add new request to the KV cache manager.
|
|
/// @param inputLength Input length for which KV cache need to be allocated.
|
|
/// @param beamWidth Beam width for which KV cache need to be allocated.
|
|
/// @param llmRequest Optional request to use for KV cache lookup.
|
|
/// @details If llmRequest is supplied and KV cache reuse is enabled, try to recover KV cache blocks for
|
|
/// inputLength - 1 tokens and populate prepopulatedPromptLen.
|
|
virtual void addSequence(LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth,
|
|
OptionalRef<LlmRequest> llmRequest = std::nullopt)
|
|
= 0;
|
|
|
|
[[nodiscard]] virtual std::optional<KVCacheBlock::IdType> removeSequence(LlmRequest::RequestIdType requestId,
|
|
OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinOnRelease = false)
|
|
= 0;
|
|
|
|
virtual void schedulingRemoveSequence(LlmRequest::RequestIdType requestId) = 0;
|
|
|
|
[[nodiscard]] virtual runtime::ITensor::SharedPtr getBlockPoolPointers() const = 0;
|
|
|
|
[[nodiscard]] virtual runtime::ITensor::SharedPtr getBlockScalePoolPointers() const = 0;
|
|
|
|
[[nodiscard]] virtual runtime::ITensor::SharedPtr getLayerToPoolMapping() const = 0;
|
|
|
|
virtual void getBlockOffsetsOfBatch(
|
|
runtime::ITensor& output, SizeType32 firstBatchSlotIdx, SizeType32 batchSize, SizeType32 beamWidth) const
|
|
= 0;
|
|
|
|
//! @return maxBlockCount of all beams
|
|
virtual SizeType32 copyBlockOffsets(
|
|
runtime::ITensor& output, SizeType32 outputSlotOffset, LlmRequest::RequestIdType requestId) const
|
|
= 0;
|
|
|
|
[[nodiscard]] virtual bool isEnableBlockReuse() const = 0;
|
|
|
|
[[nodiscard]] virtual bool isEnableIndexerKCache() const = 0;
|
|
[[nodiscard]] virtual SizeType32 getIndexerKCacheIndexHeadDim() const = 0;
|
|
[[nodiscard]] virtual SizeType32 getIndexerKCacheQuantBlockSize() const = 0;
|
|
|
|
// void removeToken(SizeType32 seqSlotIdx);
|
|
virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0;
|
|
|
|
[[nodiscard]] virtual GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const = 0;
|
|
[[nodiscard]] virtual GenerationRequest& getSequence(LlmRequest::RequestIdType requestId) = 0;
|
|
|
|
[[nodiscard]] virtual bool isCrossKv() const = 0;
|
|
|
|
//! \brief Find first new block that must be allocated for context phase and return it's concatenated token vector.
|
|
//! \details Only full blocks are considered.
|
|
[[nodiscard]] virtual std::optional<BlockKey> findNewContextBlock(
|
|
VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const
|
|
= 0;
|
|
|
|
//! \brief Store full context blocks contributed by llmRequest.
|
|
//! \details These blocks become reusable from next step.
|
|
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0;
|
|
|
|
//! \brief Store newest block for reuse.
|
|
//! \details This block become reusable from next step.
|
|
virtual void storeNewBlock(LlmRequest const& llmRequest) = 0;
|
|
|
|
/// \brief Store blocks for reuse for a given request id
|
|
[[nodiscard]] virtual std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
|
|
LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false)
|
|
= 0;
|
|
|
|
//! \brief Get the block ids of a request [per beam] **for a given window size block manager**
|
|
[[nodiscard]] virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
|
|
LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
|
|
= 0;
|
|
|
|
//! \brief Get the block ids of a batch of requests [per beam] **for a given window size block manager**
|
|
[[nodiscard]] virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
|
|
std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const
|
|
= 0;
|
|
|
|
/// @brief Get the last block id (beam 0) for a given sequence and window size
|
|
[[nodiscard]] virtual std::optional<KVCacheBlock::IdType> getLastBlockId(LlmRequest::RequestIdType requestId) const
|
|
= 0;
|
|
|
|
[[nodiscard]] virtual runtime::ITensor::SharedPtr getUniquePrimaryPool() const = 0;
|
|
[[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
|
|
[[nodiscard]] virtual runtime::ITensor::SharedPtr getIndexerKCachePool() const = 0;
|
|
[[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
|
|
|
|
virtual void syncTransferManagerWithBufferManager() = 0;
|
|
virtual void refreshBlocks() = 0;
|
|
virtual void flushIterationEvents() = 0;
|
|
virtual void resetReuseState() = 0;
|
|
|
|
[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
|
|
|
|
// Sum of numLayers * kvFactor * numKvHeads * sizePerHead for each pool
|
|
[[nodiscard]] static SizeType32 calculateCacheSizePerTokenForSingleWindowSize(
|
|
tensorrt_llm::runtime::ModelConfig const& modelConfig, std::vector<SizeType32> const& windowSizeLayers,
|
|
bool isCrossAttention, SizeType32 kvFactor)
|
|
{
|
|
auto const nkvh = modelConfig.getNumKvHeadsForGivenLayers(windowSizeLayers, isCrossAttention);
|
|
auto const sumLocalHeads = std::reduce(nkvh.cbegin(), nkvh.cend());
|
|
// NOTE: We expect the initialization of modelConfig to have already taken the tp size into account and do not
|
|
// address it here
|
|
// consider only local layers for the calculation
|
|
return sumLocalHeads * kvFactor * modelConfig.getSizePerHead();
|
|
}
|
|
|
|
    /// @brief Groups model layers by their attention window size.
    /// @param maxAttentionWindowVec Vector of maximum attention window sizes per layer (may have fewer elements than
    /// numLayers, in which case it cycles)
    /// @param numLayers Total number of layers in the model
    /// @return Map from window size to vector of layer indices that use that window size
    [[nodiscard]] static std::map<SizeType32, std::vector<SizeType32>> groupLayersByWindowSize(
        std::vector<SizeType32> const& maxAttentionWindowVec, SizeType32 numLayers);

    /// @brief Calculate the free memory available for KV cache allocation.
    /// @param bufferManager Buffer manager for memory operations
    /// @param config KV cache configuration parameters
    /// @return Tuple containing the {.freePrimaryMemBytes, .freeSecondaryMemBytes}
    [[nodiscard]] static std::tuple<uint64_t, uint64_t> calculateFreeMemBytes(
        runtime::BufferManager const& bufferManager, executor::KvCacheConfig const& config);

    /// @brief Calculate the maximum number of KV cache blocks that can be allocated based on available GPU memory.
    /// @details This function computes how many blocks each WindowBlockManager should receive based on the weighted
    /// share of memory requirements. The weighting considers both the window size and the number of
    /// layers using each window size, as well as the sum of cache sizes per token for each window.
    /// @param config KV cache configuration parameters
    /// @param isCrossAttention Whether this is for cross-attention KV cache
    /// @param dtype Data type used for KV cache values
    /// @param modelConfig Model configuration containing layer and head information
    /// @param worldConfig World configuration for multi-GPU setups
    /// @param windowSizeToLayers Map from attention window size to vector of layer indices using that window size
    /// @param allottedPrimaryMemBytes Allotted primary memory
    /// @param allottedSecondaryMemBytes Allotted secondary memory
    /// @param extraCostMemory Additional memory cost to account for CacheTransBufferManager::preAllocBufferSize
    /// @param kvFactor Factor for KV cache size calculation (typically 2 for key+value)
    /// @return Map from window size to tuple of (primary blocks, secondary blocks)
    [[nodiscard]] static BlocksPerWindow calculateMaxNumBlocks(executor::KvCacheConfig const& config,
        bool isCrossAttention, nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
        tensorrt_llm::runtime::WorldConfig const& worldConfig,
        std::map<SizeType32, std::vector<SizeType32>> const& windowSizeToLayers, uint64_t allottedPrimaryMemBytes,
        uint64_t allottedSecondaryMemBytes, size_t extraCostMemory, SizeType32 kvFactor);

    /// @brief Calculates the maximum batch size that can fit the kv-cache, given that all sequences in the batch have
    /// the provided input and output length.
    ///
    /// @param inputLength The number of input tokens in each sequence in the batch.
    /// @param outputLength The number of output tokens in each sequence in the batch.
    /// @return SizeType32 A number of sequences per batch.
    [[nodiscard]] virtual SizeType32 getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const = 0;

    /// @brief The type of cache served by this manager (e.g. kSELF or kCROSS).
    [[nodiscard]] virtual CacheType getCacheType() const = 0;

    /// @brief Look up a block in the reuse tree by its block key for the given window size.
    [[nodiscard]] virtual std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(
        BlockKey const& blockKey, SizeType32 windowSize)
        = 0;

    /// @brief Unpin blocks given their ids (counterpart to pinning via storeBlocksForReuse/pinBlocks).
    virtual void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) = 0;
};
|
|
|
|
/// @brief Concrete BaseKVCacheManager that tracks per-request GenerationRequest state and delegates block
/// bookkeeping (allocation, reuse, eviction, pools) to an internal BlockManager.
class KVCacheManager : public BaseKVCacheManager
{
public:
    friend class KVCacheManagerBindings;

    using SizeType32 = tensorrt_llm::runtime::SizeType32;
    using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>;
    using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType;

    /// @brief Construct with an explicit per-layer KV-head count and a CUDA stream object.
    KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
        BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
        std::vector<SizeType32> const& maxAttentionWindowVec,
        std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
        SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
        bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
        std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
        bool copyOnpartialReuse = true,
        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr,
        bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
        SizeType32 indexerKCacheIndexHeadDim = 0);

    /// @brief Same as above, but the CUDA stream is passed as a raw int64_t handle.
    KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
        BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
        std::vector<SizeType32> const& maxAttentionWindowVec,
        std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
        SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
        bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
        std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
        bool copyOnpartialReuse = true,
        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr,
        bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
        SizeType32 indexerKCacheIndexHeadDim = 0);

    /// @brief Construct with a uniform KV-head count across numLayers layers and a CUDA stream object.
    /// NOTE(review): enableBlockReuse defaults to true here but to false in the other overloads — confirm intent.
    KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
        BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
        std::vector<SizeType32> const& maxAttentionWindowVec,
        std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
        SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = true,
        bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
        std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
        bool copyOnpartialReuse = true,
        std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr,
        bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
        SizeType32 indexerKCacheIndexHeadDim = 0);

    /// @brief Uniform KV-head count with a raw int64_t stream handle; reduced option set (no offload priority,
    /// event manager, or connector manager parameters).
    KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
        BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
        std::vector<SizeType32> const& maxAttentionWindowVec,
        std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
        SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
        bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true,
        bool copyOnpartialReuse = true, bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
        SizeType32 indexerKCacheIndexHeadDim = 0);

    ~KVCacheManager() override = default;

    /// @brief Allocate the KV cache memory pools (optionally in UVM memory).
    void allocatePools(bool useUvm = false) override;

    /// @brief Release the KV cache memory pools.
    void releasePools() override;

    void startScheduling() override;

    [[nodiscard]] SizeType32 getTokensPerBlock() const override
    {
        return mBlockManager.getTokensPerBlock();
    }

    [[nodiscard]] SizeType32 getMaxNumBlocks() const override
    {
        return mBlockManager.getMaxNumBlocks();
    }

    [[nodiscard]] SizeType32 getUsedNumBlocks() const override
    {
        return mBlockManager.getNumAllocatedBlocks();
    }

    [[nodiscard]] SizeType32 getNumFreeBlocks() const override
    {
        return mBlockManager.getNumFreeBlocks();
    }

    [[nodiscard]] SizeType32 getNumPools() const override
    {
        return mBlockManager.getNumPools();
    }

    [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const
    {
        return mBlockManager.getNumAllocTotalBlocks();
    }

    [[nodiscard]] SizeType32 getNumAllocNewBlocks() const
    {
        return mBlockManager.getNumAllocNewBlocks();
    }

    [[nodiscard]] SizeType32 getNumReusedBlocks() const noexcept override
    {
        return mBlockManager.getNumReusedBlocks();
    }

    [[nodiscard]] SizeType32 getNumMissedBlocks() const noexcept
    {
        return mBlockManager.getNumMissedBlocks();
    }

    [[nodiscard]] std::map<SizeType32, SizeType32> getNumFreeBlocksPerWindowSize() const
    {
        return mBlockManager.getNumFreeBlocksPerWindowSize();
    }

    /// @brief Collect a snapshot of block usage counters into a single stats struct.
    [[nodiscard]] KvCacheStats getKvCacheStats() const override
    {
        KvCacheStats kvCacheStats;
        kvCacheStats.maxNumBlocks = getMaxNumBlocks();
        kvCacheStats.freeNumBlocks = getNumFreeBlocks();
        kvCacheStats.usedNumBlocks = getUsedNumBlocks();
        kvCacheStats.toksPerBlock = getTokensPerBlock();
        kvCacheStats.allocTotalBlocks = getNumAllocTotalBlocks();
        kvCacheStats.allocNewBlocks = getNumAllocNewBlocks();
        kvCacheStats.reusedBlocks = getNumReusedBlocks();
        kvCacheStats.missedBlocks = getNumMissedBlocks();
        // Hit rate = reused / (reused + missed); reported as 0 when nothing has been reused yet
        // (also avoids 0/0 when both counters are zero).
        kvCacheStats.cacheHitRate = kvCacheStats.reusedBlocks == 0 ? 0
                                                                  : static_cast<float>(kvCacheStats.reusedBlocks)
                / static_cast<float>(kvCacheStats.reusedBlocks + kvCacheStats.missedBlocks);
        kvCacheStats.numFreeBlocksPerWindowSize = getNumFreeBlocksPerWindowSize();
        kvCacheStats.allocatedBytes = mAllocatedBytes;
        return kvCacheStats;
    }

    [[nodiscard]] OffsetTableDimensions getOffsetTableDimensions() const override
    {
        OffsetTableDimensions dims;
        // We use the mMaxAttentionWindow here, because we prefer to have a single offset table for simplicity,
        // And we don't mind that it should be as wide as the widest window, because that is negligible.
        dims.maxBlocksPerSeq = mBlockManager.getWindowSizeMetadata(mMaxAttentionWindow).maxBlocksPerSeq;
        dims.numPools = mBlockManager.getNumPools();
        dims.cacheType = mBlockManager.getCacheType();
        return dims;
    }

    [[nodiscard]] std::deque<executor::KVCacheEvent> getLatestEvents(
        std::optional<std::chrono::milliseconds> timeout = std::nullopt) const override
    {
        return mBlockManager.getLatestEvents(timeout);
    }

    [[nodiscard]] BlockManager const& getBlockManager() const override
    {
        return mBlockManager;
    }

    /// @brief Function that computes the number of KV cache blocks needed to advance a request by one or two
    /// iterations
    /// @param req The request for which we need to calculate the number of needed KV cache blocks
    /// @return The number of blocks
    [[nodiscard]] SizeType32 getNeededBlocksOneStep(
        LlmRequest const& req, bool twoStepsLookAhead, SizeType32 windowSize) const override;

    /// @brief Function that computes the number of KV cache blocks remaining to advance a request to completion (i.e.
    /// for maxNewTokens); the allocated blocks are excluded
    /// @param req The request for which we need to calculate the number of needed KV cache blocks
    /// @return The number of blocks
    [[nodiscard]] SizeType32 getRemainingBlocksToCompletion(
        LlmRequest const& req, SizeType32 windowSize) const override;

    /// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed.
    void addToken(LlmRequest::RequestIdType requestId) override;

    /// @brief Add new request to the KV cache manager.
    /// @param inputLength Input length for which KV cache need to be allocated.
    /// @param beamWidth Beam width for which KV cache need to be allocated.
    /// @param llmRequest Optional request to use for KV cache lookup.
    /// @details If llmRequest is supplied and KV cache reuse is enabled, try to recover KV cache blocks for
    /// inputLength - 1 tokens and populate prepopulatedPromptLen.
    void addSequence(LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth,
        OptionalRef<LlmRequest> llmRequest = std::nullopt) override;

    /// @brief Remove the sequence with the given request id from the manager.
    /// @param pinOnRelease If true, released blocks are pinned (can later be unpinned via unpinBlocksById).
    [[nodiscard]] std::optional<KVCacheBlock::IdType> removeSequence(LlmRequest::RequestIdType requestId,
        OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinOnRelease = false) override;

    /// @brief Remove a sequence from the scheduling bookkeeping only.
    void schedulingRemoveSequence(LlmRequest::RequestIdType requestId) override;

    [[nodiscard]] runtime::ITensor::SharedPtr getBlockPoolPointers() const override
    {
        return mBlockPoolPointers;
    }

    [[nodiscard]] runtime::ITensor::SharedPtr getLayerToPoolMapping() const override
    {
        return mLayerToPoolMapping;
    }

    [[nodiscard]] runtime::ITensor::SharedPtr getBlockScalePoolPointers() const override
    {
        // TODO: add a new optional model input so the attention plugin can access these
        return mBlockScalePoolPointers;
    }

    void getBlockOffsetsOfBatch(runtime::ITensor& output, SizeType32 firstBatchSlotIdx, SizeType32 batchSize,
        SizeType32 beamWidth) const override;

    //! @return maxBlockCount of all beams
    SizeType32 copyBlockOffsets(
        runtime::ITensor& output, SizeType32 outputSlotOffset, LlmRequest::RequestIdType requestId) const override;

    [[nodiscard]] bool isEnableBlockReuse() const override
    {
        return mEnableBlockReuse;
    }

    [[nodiscard]] bool isEnableIndexerKCache() const override
    {
        return mBlockManager.isEnableIndexerKCache();
    }

    [[nodiscard]] SizeType32 getIndexerKCacheIndexHeadDim() const override
    {
        return mBlockManager.getIndexerKCacheIndexHeadDim();
    }

    [[nodiscard]] SizeType32 getIndexerKCacheQuantBlockSize() const override
    {
        return mBlockManager.getIndexerKCacheQuantBlockSize();
    }

    /// @brief Remove a token from the sequence with the given request id.
    void removeToken(LlmRequest::RequestIdType requestId);

    /// @brief Rewind the KV cache of a request by rewindLengths tokens.
    void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override;

    [[nodiscard]] GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const override;
    [[nodiscard]] GenerationRequest& getSequence(LlmRequest::RequestIdType requestId) override;

    [[nodiscard]] bool isCrossKv() const override
    {
        return mBlockManager.getCacheType() == CacheType::kCROSS;
    }

    [[nodiscard]] CacheType getCacheType() const override
    {
        return mBlockManager.getCacheType();
    }

    //! \brief Find first new block that must be allocated for context phase and return it's concatenated token vector.
    //! \details Only full blocks are considered.
    [[nodiscard]] std::optional<BlockKey> findNewContextBlock(
        VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const override;

    //! \brief Store full context blocks contributed by llmRequest.
    //! \details These blocks become reusable from next step.
    void storeContextBlocks(LlmRequest const& llmRequest) override;

    //! \brief Store newest blocks for reuse
    void storeNewBlock(LlmRequest const& llmRequest) override;

    /// \brief Store blocks for reuse for a given request id; returns the stored block ids.
    [[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
        LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false) override;

    /// @brief Number of tokens needed to pad the sink tokens up to a full block.
    [[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);

    [[nodiscard]] SizeType32 getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const override;

    /// @brief Calculates the number of kv-cache blocks that a sequence will require.
    ///
    /// @param inputLength The number of input tokens in the sequence.
    /// @param outputLength The number of output tokens in the sequence.
    /// @param sinkTokenLength The number of sink tokens configured.
    /// @param maxAttentionWindow The attention window size allowed by the model.
    /// @param beamWidth The number of beams to consider for the request.
    /// @param tokensPerBlock The number of tokens a single kv-cache block contains.
    /// @return SizeType32 A number of blocks.
    [[nodiscard]] static SizeType32 calculateMaxBlockRequirements(SizeType32 inputLength, SizeType32 outputLength,
        SizeType32 sinkTokenLength, SizeType32 windowSize, SizeType32 beamWidth, SizeType32 tokensPerBlock);

    /// @brief Pin the blocks of the given request so they are kept across release.
    void pinBlocks(LlmRequest::RequestIdType requestId) override;

    /// @brief Unpin blocks given their ids (counterpart of pinBlocks / pinOnRelease).
    void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) override;

    std::optional<KVCacheBlock::IdType> getLastBlockId(LlmRequest::RequestIdType requestId) const override;

    /// @brief Calculates the number of kv-cache blocks that a sequence will require, for a single beam.
    ///
    /// @param sequenceLength The total length of the sequence (input and output).
    /// @param sinkTokenLength The number of sink tokens configured.
    /// @param windowSize The attention window size
    /// @param tokensPerBlock The number of tokens in a single kv-cache block.
    /// @return SizeType32 A number of blocks.
    [[nodiscard]] static SizeType32 calculateMaxBlockRequirementsPerBeam(
        SizeType32 sequenceLength, SizeType32 sinkTokenLength, SizeType32 windowSize, SizeType32 tokensPerBlock);

    std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
        LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override;

    std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
        std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const override;

    runtime::ITensor::SharedPtr getUniquePrimaryPool() const override;
    runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override;
    runtime::ITensor::SharedPtr getIndexerKCachePool() const override;

    SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const override
    {
        return mBlockManager.getPoolLayerIdx(layer_idx);
    }

    void syncTransferManagerWithBufferManager() override
    {
        mBlockManager.syncTransferManagerWithBufferManager();
    }

    //! \brief Perform per-iteration bookkeeping
    void refreshBlocks() override
    {
        mBlockManager.refreshBlocks();
    }

    void flushIterationEvents() override
    {
        mBlockManager.flushIterationEvents();
    }

    std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(
        BlockKey const& blockKey, SizeType32 windowSize) override
    {
        return mBlockManager.findBlocksInReuseTreeByBlockKey(blockKey, windowSize);
    }

    void resetReuseState() override
    {
        mBlockManager.resetReuseState();
    }

    /// @brief Finds the maximum attention window that can be used on a sequence, given some kv-cache block capacity.
    ///
    /// @param inputLength The number of input tokens in the sequence.
    /// @param outputLength The number of output tokens in the sequence.
    /// @param sinkTokenLength The number of sink tokens.
    /// @param blockCapacity The number of kv-cache blocks available.
    /// @param beamWidth The number of beams to consider.
    /// @param tokensPerBlock The number of tokens per kv-cache block.
    /// @return SizeType32 A maximum attention window in number of tokens.
    [[nodiscard]] static SizeType32 calculateMaxAttentionWindow(SizeType32 inputLength, SizeType32 outputLength,
        SizeType32 sinkTokenLength, SizeType32 blockCapacity, SizeType32 beamWidth, SizeType32 tokensPerBlock);

private:
    // Maximum number of sequences
    SizeType32 mMaxNumSequences;
    // Maximum beam width
    SizeType32 mMaxBeamWidth;
    // Data type of the KV cache values
    nvinfer1::DataType mDataType;
    // Maximum kv cache length per sequence
    SizeType32 mMaxAttentionWindow;
    // Number of tokens per block
    SizeType32 mTokensPerBlock;
    // Number of tokens to fill up the sink tokens to a full block size
    SizeType32 mSinkBubbleLength;
    // Number of tokens in the sink blocks
    SizeType32 mSinkBlockTokenLength;
    // Block manager
    BlockManager mBlockManager;
    // Map of all sequences
    std::unordered_map<LlmRequest::RequestIdType, GenerationRequest> mSequences;
    // Whether to cache KV pages for reuse
    bool mEnableBlockReuse;
    // Mutex to protect access to mSequences
    mutable std::mutex mSequencesMtx;
    // buffers for static tensors, will be created after allocating pools
    runtime::ITensor::SharedPtr mBlockPoolPointers;
    runtime::ITensor::SharedPtr mLayerToPoolMapping;
    runtime::ITensor::SharedPtr mBlockScalePoolPointers;
    runtime::ITensor::SharedPtr mIndexerKCachePoolPointers;
    // GPU bytes allocated for KV-cache
    std::size_t mAllocatedBytes{0};
};
|
|
|
|
} // namespace tensorrt_llm::batch_manager::kv_cache_manager
|