/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/batch_manager/kvCacheUtils.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <numeric>

#include "tensorrt_llm/common/cudaUtils.h"

namespace tc = tensorrt_llm::common;
namespace tr = tensorrt_llm::runtime;

using SizeType32 = tensorrt_llm::runtime::SizeType32;
using LlmRequest = tensorrt_llm::batch_manager::LlmRequest;

using namespace tensorrt_llm::batch_manager::kv_cache_manager;
using namespace tensorrt_llm::batch_manager;

// ---------------------------------------
// BlockIteratorTest
// ---------------------------------------

class BlockIteratorTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init)
{
public:
    void SetUp() override {}

    void TearDown() override {}
};

TEST_F(BlockIteratorTest, BasicTest)
{
    using DataType = int32_t;
    auto constexpr mNumPrimaryBlocks = 10;
    auto constexpr mNumLayers = 5;
    auto constexpr mBlockSize = 32;
    auto const cacheShape = tr::ITensor::makeShape({mNumPrimaryBlocks, mNumLayers, 2, mBlockSize});
    constexpr nvinfer1::DataType dtype{tr::TRTDataType<DataType>::value};
    tr::ITensor::SharedPtr pool = tr::BufferManager::cpu(cacheShape, dtype);
    std::vector<SizeType32> blockIds(mNumPrimaryBlocks);
    std::iota(blockIds.begin(), blockIds.end(), 0);
    // Write each block's index into its slice of the pool so the iterator's contents can be verified below.
    for (auto idx : blockIds)
    {
        auto blockTensor = tr::ITensor::slice(pool, blockIds.at(idx), 1);
        std::fill_n(tr::bufferCast<DataType>(*blockTensor), blockTensor->getSize(), idx);
    }
    auto range = BlockRangeForWindow(nullptr, 0, std::move(blockIds), std::move(pool));
    auto begin = range.begin();
    auto end = range.end();

    auto allEqualTo = [](tr::ITensor const& tensor, auto x) -> bool
    {
        const auto* begin = tr::bufferCast<DataType>(tensor);
        const auto* end = begin + tensor.getSize();
        return std::all_of(begin, end, [x](auto n) { return n == x; });
    };

    DataType cnt{0};
    for (auto const& tensor : range)
    {
        EXPECT_TRUE(allEqualTo(tensor, cnt++));
    }
}

TEST_F(BlockIteratorTest, CacheManagerTest)
{
    auto constexpr dataType = nvinfer1::DataType::kFLOAT;
    auto constexpr numLayers = 12;
    auto constexpr numKvHeads = 6;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr maxBlocksPerSeq = 4;
    auto constexpr blocksInPrimaryPool = 8;
    auto constexpr blocksInSecondaryPool = 0;
    auto constexpr maxNumSequences = 8;
    auto constexpr maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
    auto const stream = std::make_shared<tr::CudaStream>();
    auto constexpr onboardBlocks = true;

    // TODO: Support and add coverage for beamWidth > 1
    auto constexpr beamWidth = 1;
    auto constexpr numBlocksPerBeam = blocksInPrimaryPool / beamWidth;
    auto constexpr maxSequenceLength = tokensPerBlock * numBlocksPerBeam;
    auto const maxAttentionWindowVec = std::vector<SizeType32>{maxAttentionWindow};

    using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
    const BlocksPerWindow blocksPerWindow
        = {{maxAttentionWindow, std::make_tuple(blocksInPrimaryPool, blocksInSecondaryPool)}};

    BlockManager blockManager(std::vector<SizeType32>(numLayers, numKvHeads), sizePerHead, tokensPerBlock,
        blocksPerWindow, maxNumSequences, stream, maxSequenceLength, beamWidth, maxAttentionWindowVec, std::nullopt,
        dataType, 0, onboardBlocks);
    blockManager.allocatePools(false);

    EXPECT_EQ(blockManager.getTokensPerBlock(), tokensPerBlock);
    EXPECT_EQ(blockManager.getMaxNumBlocks(), blocksInPrimaryPool);
    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);

    SizeType32 constexpr maxNewTokens{0};
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};
    auto inputTokens = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7, 8});
    auto const inputLength = static_cast<SizeType32>(inputTokens->size());
    LlmRequest::RequestIdType requestId{0};
    auto llmRequest0 = std::make_shared<LlmRequest>(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming);

    GenerationRequest seq0{requestId, inputLength, beamWidth, blockManager.getWindowSizesMetadata()};

    auto constexpr beamIdx = 0;
    auto promptLen0 = llmRequest0->getNumTokens(beamIdx);
    auto numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock());
    blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow);
    // 9 prompt tokens at 4 tokens per block occupy 3 context blocks.
    auto const blockIds = seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx);
    EXPECT_THAT(blockIds, ::testing::ElementsAreArray({0, 1, 2}));

    auto const pool = blockManager.getPrimaryPool(0);
    TLLM_CHECK(pool);

    auto blockIdsVec = std::vector<SizeType32>(blockIds.begin(), blockIds.end());
    auto poolCopy = pool;
    auto range = BlockRangeForWindow(nullptr, maxAttentionWindow, std::move(blockIdsVec), std::move(poolCopy));
    size_t cnt{0};
    // Each block holds K and V for every layer, hence the factor of 2 in the expected element count.
    for (auto iter = range.begin(); iter != range.end(); ++iter, ++cnt)
    {
        EXPECT_EQ(iter->getSize(), numLayers * numKvHeads * sizePerHead * tokensPerBlock * 2);
    }
    EXPECT_EQ(cnt, blockIds.size());
}
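
// A minimal edge-case sketch, not part of the original suite: it assumes BlockRangeForWindow
// accepts an empty block-id list (unverified against the implementation). Under that assumption,
// iterating the range should visit no blocks. Only constructs already exercised above are used.
TEST_F(BlockIteratorTest, EmptyRangeSketch)
{
    using DataType = int32_t;
    // A trivial one-block pool; its contents are never read because the id list is empty.
    auto const cacheShape = tr::ITensor::makeShape({1, 1, 2, 1});
    constexpr nvinfer1::DataType dtype{tr::TRTDataType<DataType>::value};
    tr::ITensor::SharedPtr pool = tr::BufferManager::cpu(cacheShape, dtype);

    auto range = BlockRangeForWindow(nullptr, 0, std::vector<SizeType32>{}, std::move(pool));
    size_t cnt{0};
    for (auto iter = range.begin(); iter != range.end(); ++iter)
    {
        ++cnt;
    }
    EXPECT_EQ(cnt, size_t{0});
}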