This MR is a preliminary step toward implementing the SWA reuse mechanism in the KV cache manager. Please be aware that **no functional change is intended** in this merge request. The purpose of the clean-up is to decouple and remove existing functions so that the upcoming SWA KV cache reuse change is more natural and easier to review.

Right now, (1) StreamingLLM and (2) beam search with SWA are broken. We do not want to complicate the code base by stacking more features on top of something that does not work. This MR prunes out the related logic and adds assertions, so that we can come back later, re-support the broken features, and remove the assertions. Since StreamingLLM (sink attention) is broken now, an assertion is added in the `KVCacheManager` constructor to guard the values of `mSinkBlockTokenLength` and `mSinkBubbleLength`, and the compute logic related to them is pruned. Beam search with SWA will still be broken when SWA KV cache reuse is introduced; we will revisit this problem in the future. On top of this, we should update the [support matrix](https://github.com/NVIDIA/TensorRT-LLM/blob/feat/1.0_doc_dev/docs/source/1.0/features/feature-combination-matrix.md) of the KV cache manager after the SWA KV cache reuse support is merged.

The changes are as follows:

- Separate `KVCacheManager::updateToken` into `KVCacheManager::addToken` and `KVCacheManager::removeToken`; the two responsibilities should be decoupled (see the sketch below this description).
- Push the utilities `cacheSequenceBlockOffsets` and `cacheNewBlockOffset` from `KVCacheManager` down to `WindowBlockManager`. Functions exposed by `KVCacheManager` should be real utilities that users of the structure can leverage; implementation-detail calls should not exist at this level.
- Simplify the "is shared last context block" logic in `KVCacheManager::addSequence`.

Since no functional change is intended in this merge request, no test case is added; several comments are added as reminders for future test coverage. In `LlmRequestTest.ParamTest`, `streaming=True` is commented out because sink attention is now guarded by an assertion. In `capacitySchedulerTest`, the `addToken` call on `crossKVCacheManager` is removed because, in an encoder-decoder model, generation tokens are added only to the decoder and not to the encoder.

Signed-off-by: eopXD <yuehtingc@nvidia.com>
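Purely as illustration of the decoupling described in the first bullet, here is a minimal sketch of what an `addToken`/`removeToken` split plus a sink-attention guard could look like. The class `SketchKVCacheManager`, its members, and all signatures are hypothetical and are not the actual `KVCacheManager` API; only the method names `addToken`/`removeToken` and the member names `mSinkBlockTokenLength`/`mSinkBubbleLength` come from this description.

```cpp
#include <cassert>
#include <cstdint>
#include <unordered_map>

// Hypothetical stand-ins for the real types; names are illustrative only.
using SizeType32 = std::int32_t;
using RequestIdType = std::uint64_t;

class SketchKVCacheManager
{
public:
    SketchKVCacheManager(SizeType32 tokensPerBlock, SizeType32 sinkBlockTokenLength, SizeType32 sinkBubbleLength)
        : mTokensPerBlock{tokensPerBlock}
    {
        // Guard for the (currently broken) sink-attention path, as described above:
        // fail loudly instead of silently mis-handling sink tokens.
        assert(sinkBlockTokenLength == 0 && "sink attention is not supported yet");
        assert(sinkBubbleLength == 0 && "sink attention is not supported yet");
    }

    // Formerly a single update function; splitting it makes each call site's intent explicit.
    void addToken(RequestIdType requestId)
    {
        auto& numTokens = mNumTokens[requestId];
        ++numTokens;
        if ((numTokens - 1) % mTokensPerBlock == 0)
        {
            allocateBlock(requestId); // first token of a new block needs a fresh block
        }
    }

    void removeToken(RequestIdType requestId)
    {
        auto& numTokens = mNumTokens.at(requestId);
        assert(numTokens > 0);
        --numTokens;
        if (numTokens % mTokensPerBlock == 0)
        {
            releaseLastBlock(requestId); // the last block just became empty
        }
    }

private:
    void allocateBlock(RequestIdType requestId) { mNumBlocks[requestId] += 1; }
    void releaseLastBlock(RequestIdType requestId) { mNumBlocks[requestId] -= 1; }

    SizeType32 mTokensPerBlock;
    std::unordered_map<RequestIdType, SizeType32> mNumTokens;
    std::unordered_map<RequestIdType, SizeType32> mNumBlocks;
};
```

The point of the sketch is only the symmetry: `addToken` allocates a block exactly when the new token starts a block, and `removeToken` releases one exactly when the removed token empties a block, so each operation can be reasoned about (and later reused by the SWA path) in isolation.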
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include "tensorrt_llm/batch_manager/kvCacheUtils.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "tensorrt_llm/common/cudaUtils.h"

namespace tc = tensorrt_llm::common;
namespace tr = tensorrt_llm::runtime;
using SizeType32 = tensorrt_llm::runtime::SizeType32;
using LlmRequest = tensorrt_llm::batch_manager::LlmRequest;
using namespace tensorrt_llm::batch_manager::kv_cache_manager;
using namespace tensorrt_llm::batch_manager;

// ---------------------------------------
// BlockIteratorTest
// ---------------------------------------

class BlockIteratorTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init)
{
public:
    void SetUp() override {}

    void TearDown() override {}
};

TEST_F(BlockIteratorTest, BasicTest)
{
    using DataType = int32_t;
    auto constexpr mNumPrimaryBlocks = 10;
    auto constexpr mNumLayers = 5;
    auto constexpr mBlockSize = 32;
    auto const cacheShape = tr::ITensor::makeShape({mNumPrimaryBlocks, mNumLayers, 2, mBlockSize});
    constexpr nvinfer1::DataType dtype{tr::TRTDataType<DataType>::value};
    tr::ITensor::SharedPtr pool = tr::BufferManager::cpu(cacheShape, dtype);
    std::vector<SizeType32> blockIds(mNumPrimaryBlocks);
    std::iota(blockIds.begin(), blockIds.end(), 0);
    for (auto idx : blockIds)
    {
        auto blockTensor = tr::ITensor::slice(pool, blockIds.at(idx), 1);
        std::fill_n(tr::bufferCast<DataType>(*blockTensor), blockTensor->getSize(), idx);
    }
    auto range = BlockRange(pool, blockIds);
    auto begin = range.begin();
    auto end = range.end();
    auto allEqualTo = [](tr::ITensor const& tensor, auto x) -> bool
    {
        const auto* begin = tr::bufferCast<decltype(x)>(tensor);
        const auto* end = begin + tensor.getSize();
        return std::all_of(begin, end, [x](auto n) { return n == x; });
    };
    DataType cnt{0};
    for (auto const& tensor : range)
    {
        EXPECT_TRUE(allEqualTo(tensor, cnt++));
    }
}

TEST_F(BlockIteratorTest, CacheManagerTest)
{
    auto constexpr dataType = nvinfer1::DataType::kFLOAT;
    auto constexpr numLayers = 12;
    auto constexpr numKvHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr blocksInPrimaryPool = 8;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;

    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr onboardBlocks = true;

    // TODO: Support and add coverage for beamWidth > 1
    auto constexpr beamWidth = 1;
    auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth;
    auto constexpr maxSequenceLength = tokensPerBlock * numBlocksPerBeam;
    auto const maxAttentionWindowVec = std::vector<BlockManager::SizeType32>{maxAttentionWindow};

    using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
    const BlocksPerWindow blocksPerWindow
        = {{maxAttentionWindow, std::make_tuple(blocksInPrimaryPool, blocksInSecondaryPool)}};

    BlockManager blockManager(std::vector<BlockManager::SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock,
        blocksPerWindow, maxNumSequences, stream, maxSequenceLength, beamWidth, maxAttentionWindowVec, std::nullopt,
        dataType, 0, onboardBlocks);
    blockManager.allocatePools(false);

    EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
    EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};

    auto inputTokens = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8});
    auto const inputLength = static_cast<SizeType32>(inputTokens->size());
    LlmRequest::RequestIdType requestId{0};
    auto llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);

    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

    auto constexpr beamIdx = 0;
    auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
    auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);

    auto const blockIds = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx);
    EXPECT_THAT(blockIds, ::testing::ElementsAreArray({0, 1, 2}));

    auto const pool = blockManager.getPrimaryPool(0);
    TLLM_CHECK(pool);
    auto range = BlockRange(pool, blockIds);
    size_t cnt{0};
    for (auto iter = range.begin(); iter != range.end(); ++iter, ++cnt)
    {
        EXPECT_EQ(iter->getSize(), numLayers * numKvHeads * sizePerHead * tokensPerBlock * 2);
    }
    EXPECT_EQ(cnt, blockIds.size());
}