/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "tensorrt_llm/batch_manager/capacityScheduler.h"
#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/peftCacheManager.h"
#include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/requestUtils.h"
#include "tensorrt_llm/executor/types.h"

#include <cstdint>
#include <cstdlib>
#include <functional>
#include <map>
#include <memory>
#include <numeric>
#include <optional>
#include <set>
#include <tuple>
#include <vector>

using namespace tensorrt_llm::runtime;
using namespace tensorrt_llm::batch_manager;
using tensorrt_llm::executor::insertRequestInOrder;

namespace tc = tensorrt_llm::common;

using CudaStreamPtr = std::shared_ptr<CudaStream>;
using VecTokens = std::vector<int32_t>;

struct RequestState
{
    int32_t mRequestId;
    int32_t mMaxNewTokens;
    int32_t mPromptLen;
    int32_t mNumGenTokensBegin;
    int32_t mContextCurrentPosition{0};
};

struct ExpectedState
{
    int32_t itBegin;
    int32_t itEnd;
    std::vector<int32_t> activeIds;
    std::vector<RequestState> scheduledState;
    std::vector<RequestState> scheduledDisaggGenInitState;
};

class MockPeftCacheManager : public BasePeftCacheManager
{
public:
    MockPeftCacheManager(SizeType32 numPages = 15, SizeType32 maxDevicePages = 100, SizeType32 maxHostPages = 1000)
        : mNumPages(numPages)
        , mMaxDevicePages(maxDevicePages)
        , mMaxHostPages(maxHostPages)
    {
    }

    void addRequestPeft(std::shared_ptr<LlmRequest> llmRequest, bool tryGpuCache = true) override {}

    PeftTable ensureBatch(RequestVector const&, RequestVector const&, bool resetGpuCache = false) override
    {
        return PeftTable{};
    }

    void resetDeviceCache() override {}

    void markRequestDone(LlmRequest const& llmReq, bool pause = false) override {}

    [[nodiscard]] SizeType32 getMaxDevicePages() const override
    {
        return mMaxDevicePages;
    }

    [[nodiscard]] SizeType32 getMaxHostPages() const override
    {
        return mMaxHostPages;
    }

    [[nodiscard]] SizeType32 determineNumPages(std::shared_ptr<LlmRequest> llmRequest) const override
    {
        return mNumPages;
    }

    inline bool enabled() const override
    {
        return true;
    }

private:
    SizeType32 mNumPages;
    SizeType32 mMaxDevicePages;
    SizeType32 mMaxHostPages;
};
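// Fixture providing the building blocks for the scheduler tests below: a KVCacheManager
// with a configurable block budget, a mock PEFT cache manager, and helpers that create
// LlmRequests (optionally with a LoRA task id, a priority, and an initial request state).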
class CapacitySchedulerTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init)
{
protected:
    using CapacitySchedulerPolicy = tensorrt_llm::executor::CapacitySchedulerPolicy;

    CapacitySchedulerTest() {}

    void SetUp() override {}

    void TearDown() override {}

    static std::shared_ptr<kv_cache_manager::KVCacheManager> getKvCacheManager(SizeType32 maxNumRequests,
        SizeType32 tokensPerBlock, SizeType32 maxNumTokens, SizeType32 maxNumTokensPerSeq,
        SizeType32 sinkTokenLength = 0, bool enableReuse = false,
        kv_cache_manager::CacheType cacheType = kv_cache_manager::CacheType::kSELF)
    {
        auto const numLayers = 10;
        auto const nbKvHeads = 10;
        auto constexpr sizePerHead = 1;
        auto const maxNumBlocks = tc::divUp(maxNumTokens, tokensPerBlock);
        auto const kvDtype = nvinfer1::DataType::kHALF;
        bool onboardBlocks = true;
        CudaStreamPtr streamPtr = std::make_shared<CudaStream>();
        using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
        auto const blocksPerWindow = BlocksPerWindow{{maxNumTokensPerSeq, {maxNumBlocks, 0}}};
        // init KV cache block manager
        return std::make_shared<kv_cache_manager::KVCacheManager>(numLayers, nbKvHeads, sizePerHead, tokensPerBlock,
            blocksPerWindow, maxNumRequests, 1, std::vector<SizeType32>{maxNumTokensPerSeq}, std::nullopt, kvDtype,
            sinkTokenLength, streamPtr, maxNumTokensPerSeq, enableReuse, onboardBlocks, cacheType);
    }

    static std::shared_ptr<BasePeftCacheManager> getPeftCacheManager()
    {
        return std::make_shared<MockPeftCacheManager>();
    }

    static std::shared_ptr<LlmRequest> createRequest(std::shared_ptr<std::vector<int32_t>> inputTokens,
        int32_t maxNewTokens, std::optional<uint64_t> optionalReqId,
        std::optional<uint64_t> loraTaskId = std::nullopt,
        tensorrt_llm::executor::PriorityType priority = tensorrt_llm::executor::Request::kDefaultPriority,
        LlmRequestState state = LlmRequestState::kCONTEXT_INIT)
    {
        tensorrt_llm::runtime::SamplingConfig samplingConfig;
        uint64_t reqId = optionalReqId.value_or((rand() % INT64_MAX) + 1);
        auto req = std::make_shared<LlmRequest>(reqId, maxNewTokens, inputTokens, samplingConfig, false);
        req->setPriority(priority);
        req->setState(state);
        if (loraTaskId.has_value())
        {
            req->setLoraTaskId(loraTaskId.value());
        }
        return req;
    }

    static std::shared_ptr<LlmRequest> createRequest(int32_t promptLen, int32_t maxNewTokens,
        std::optional<uint64_t> optionalReqId, std::optional<uint64_t> loraTaskId = std::nullopt,
        tensorrt_llm::executor::PriorityType priority = tensorrt_llm::executor::Request::kDefaultPriority,
        LlmRequestState state = LlmRequestState::kCONTEXT_INIT)
    {
        auto inputTokens = std::make_shared<std::vector<int32_t>>(promptLen, 1);
        return createRequest(inputTokens, maxNewTokens, optionalReqId, loraTaskId, priority, state);
    }

    static std::shared_ptr<LlmRequest> createFromExecutorRequest(int32_t promptLen, int32_t maxNewTokens,
        int32_t encoderInputLen, std::optional<uint64_t> optionalReqId,
        std::optional<uint64_t> loraTaskId = std::nullopt,
        tensorrt_llm::executor::PriorityType priority = tensorrt_llm::executor::Request::kDefaultPriority)
    {
        auto inputTokens = VecTokens(promptLen, 1);
        auto encoderInputTokens = VecTokens(encoderInputLen, 1);
        tensorrt_llm::executor::OutputConfig outConfig;
        outConfig.excludeInputFromOutput = false;
        outConfig.returnLogProbs = false;
        outConfig.returnGenerationLogits = false;
        outConfig.returnContextLogits = false;
        outConfig.returnEncoderOutput = false;
        uint64_t reqId = optionalReqId.value_or((rand() % INT64_MAX) + 1);
        bool streaming = false;
        auto executorReq = tensorrt_llm::executor::Request(
            inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(), outConfig);
        executorReq.setEncoderInputTokenIds(encoderInputTokens);
        auto req = std::make_shared<LlmRequest>(reqId, executorReq);
        req->setPriority(priority);
        if (loraTaskId.has_value())
        {
            req->setLoraTaskId(loraTaskId.value());
        }
        return req;
    }
};

using RequestTable = std::map<uint64_t, std::shared_ptr<LlmRequest>>;
// CheckCallback is not exercised in these tests; the signature below is a best-effort reconstruction.
using CheckCallback = std::function<void(RequestList&, RequestTable const&, int)>;
using AddNewRequestsCallback = std::function<void(RequestList&, int)>;

void prepRequestsForEncoderSkip(RequestList& activeRequests)
{
    for (auto& req : activeRequests)
    {
        if (req->isEncoderInitState() && req->getEncoderTokens())
        {
            TLLM_LOG_INFO("Changing state of request and setting encoder output to skip encoder run");
            req->setState(tensorrt_llm::batch_manager::LlmRequestState::kCONTEXT_INIT);
        }
    }
}
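// Drives the scheduler until all requests finish, mimicking one iteration of
// TrtModelInflightBatching per loop: schedule, verify the ExpectedState covering the
// current iteration, pause evicted requests, advance context chunks, append one
// generated token per scheduled request, and retire completed requests.
// Returns the total number of iterations executed.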
int runTest(CapacityScheduler& capacityScheduler,
    std::shared_ptr<kv_cache_manager::KVCacheManager> const& kvCacheManager, RequestList& activeRequests,
    std::vector<ExpectedState> const& expectedStates, AddNewRequestsCallback const& addNewRequestsCb,
    SizeType32 maxInputLen, std::shared_ptr<BasePeftCacheManager> const& peftCacheManager = nullptr,
    std::shared_ptr<kv_cache_manager::KVCacheManager> const& crossKvCacheManager = nullptr,
    bool hasDisaggGenInit = false)
{
    int itCount = 0;
    while (!activeRequests.empty())
    {
        addNewRequestsCb(activeRequests, itCount);
        prepRequestsForEncoderSkip(activeRequests);

        auto [scheduledRequestsList, scheduledDisaggGenInitRequestsLists, pausedRequests]
            = capacityScheduler(activeRequests, kvCacheManager, peftCacheManager, crossKvCacheManager);

        RequestTable scheduledRequests;
        for (auto& req : scheduledRequestsList)
        {
            scheduledRequests.emplace(req->mRequestId, req);
        }
        RequestTable scheduledDisaggGenInitRequests;
        for (auto& req : scheduledDisaggGenInitRequestsLists)
        {
            scheduledDisaggGenInitRequests.emplace(req->mRequestId, req);
        }

        // ------------------
        // Check state
        // ------------------
        for (size_t i = 0; i < expectedStates.size(); ++i)
        {
            auto const& expectedState = expectedStates.at(i);
            // If the iteration falls within a range, check the state
            if (itCount >= expectedState.itBegin && itCount < expectedState.itEnd)
            {
                std::set<int32_t> previousScheduled;
                if (i > 0)
                {
                    for (auto const& state : expectedStates[i - 1].scheduledState)
                    {
                        previousScheduled.insert(state.mRequestId);
                    }
                }
                EXPECT_EQ(activeRequests.size(), expectedState.activeIds.size()) << "itCount: " << itCount;
                EXPECT_EQ(scheduledRequests.size(), expectedState.scheduledState.size()) << "itCount: " << itCount;
                EXPECT_EQ(scheduledDisaggGenInitRequests.size(), expectedState.scheduledDisaggGenInitState.size())
                    << "itCount: " << itCount;

                // Check that active requests are as expected
                int reqCount = 0;
                for (auto const& activeReq : activeRequests)
                {
                    EXPECT_EQ(activeReq->mRequestId, expectedState.activeIds[reqCount]) << "itCount: " << itCount;
                    reqCount++;
                }

                // Check that scheduled requests are as expected
                for (auto scheduledReqState : expectedState.scheduledState)
                {
                    // Check that the id is found in the scheduled requests
                    EXPECT_NE(scheduledRequests.find(scheduledReqState.mRequestId), scheduledRequests.end())
                        << "itCount: " << itCount << ", mRequestId: " << scheduledReqState.mRequestId;
                }

                // Check that scheduled disagg gen init requests are as expected
                for (auto scheduledReqState : expectedState.scheduledDisaggGenInitState)
                {
                    // Check that the id is found in the scheduled disagg gen init requests
                    EXPECT_NE(scheduledDisaggGenInitRequests.find(scheduledReqState.mRequestId),
                        scheduledDisaggGenInitRequests.end())
                        << "itCount: " << itCount << ", mRequestId: " << scheduledReqState.mRequestId;
                }

                // Check that all newly scheduled requests are in context init state
                if (itCount == expectedState.itBegin)
                {
                    for (auto scheduledReqState : expectedState.scheduledState)
                    {
                        auto scheduledId = scheduledReqState.mRequestId;
                        if (!hasDisaggGenInit)
                        {
                            if (previousScheduled.find(scheduledId) == previousScheduled.end())
                            {
                                EXPECT_EQ(
                                    scheduledRequests.at(scheduledId)->getState(), LlmRequestState::kCONTEXT_INIT)
                                    << "itCount: " << itCount << " reqId: " << scheduledId;
                            }
                            else if (!scheduledRequests.at(scheduledId)->getContextRemainingLength())
                            {
                                EXPECT_EQ(scheduledRequests.at(scheduledId)->getState(),
                                    LlmRequestState::kGENERATION_IN_PROGRESS)
                                    << "itCount: " << itCount << " reqId: " << scheduledId;
                            }
                        }
                        // Check that request parameters are as expected
                        EXPECT_EQ(scheduledRequests.at(scheduledId)->mMaxNewTokens, scheduledReqState.mMaxNewTokens)
                            << "itCount: " << itCount << ", reqId: " << scheduledId;
                        EXPECT_EQ(scheduledRequests.at(scheduledId)->mPromptLen, scheduledReqState.mPromptLen)
                            << "itCount: " << itCount << ", reqId: " << scheduledId;
                        EXPECT_EQ(scheduledRequests.at(scheduledId)->getMaxNumGeneratedTokens(),
                            scheduledReqState.mNumGenTokensBegin)
                            << "itCount: " << itCount << ", reqId: " << scheduledId;
                        EXPECT_EQ(scheduledRequests.at(scheduledId)->getContextCurrentPosition(),
                            scheduledReqState.mContextCurrentPosition)
                            << "itCount: " << itCount << ", reqId: " << scheduledId;
                    }
                }
            }
        }

        // Mock the behavior of TrtModelInflightBatching:
        // pause all requests that haven't been scheduled
        for (auto const& llmReq : pausedRequests)
        {
            kvCacheManager->removeSequence(llmReq->mRequestId);
            if (crossKvCacheManager)
            {
                crossKvCacheManager->removeSequence(llmReq->mRequestId);
            }
            llmReq->pause(maxInputLen);
            // All non-scheduled requests should be in state CONTEXT_INIT
            EXPECT_EQ(llmReq->getState(), LlmRequestState::kCONTEXT_INIT);
        }

        // Move scheduled disagg gen init requests to generation in progress
        for (auto& [reqId, llmReq] : scheduledDisaggGenInitRequests)
        {
            if (llmReq->isDisaggGenerationInitState())
            {
                kvCacheManager->addSequence(
                    llmReq->mRequestId, llmReq->mPromptLen, llmReq->mSamplingConfig.beamWidth, llmReq);
                llmReq->setState(LlmRequestState::kGENERATION_IN_PROGRESS);
                llmReq->setContextCurrentPosition(llmReq->mPromptLen);
                llmReq->setDecodingIter(1);
                llmReq->addNewTokens({itCount});
            }
        }

        // Append a token for all scheduled requests
        for (auto& [reqId, llmReq] : scheduledRequests)
        {
            auto const promptLen = llmReq->mPromptLen;
            if (llmReq->isContextInitState())
            {
                if (!llmReq->isLastContextChunk())
                {
                    TLLM_CHECK_WITH_INFO(llmReq->getContextChunkSize() % kvCacheManager->getTokensPerBlock() == 0,
                        "To prevent cache fragmentation, the context chunk size should be divisible by the number of "
                        "tokens per block, except for the last chunk.");
                }
                if (llmReq->isFirstContextChunk())
                {
                    // Initialization work is needed for the first context chunk.
                    kvCacheManager->addSequence(
                        llmReq->mRequestId, promptLen, llmReq->mSamplingConfig.beamWidth, llmReq);
                    if (crossKvCacheManager)
                    {
                        crossKvCacheManager->addSequence(llmReq->mRequestId, llmReq->getEncoderOutputLen(),
                            llmReq->mSamplingConfig.beamWidth, llmReq);
                    }
                }
                auto preContextLength = llmReq->getContextChunkSize();
                // The values returned by isFirstContextChunk and isLastContextChunk change after this call.
                // The call also resets the context chunk size to zero, so it has to be restored afterwards.
                llmReq->moveToNextContextChunk();
                llmReq->setContextChunkSize(preContextLength);
                if (llmReq->getContextRemainingLength() == 0)
                {
                    kvCacheManager->storeContextBlocks(*llmReq);
                    if (crossKvCacheManager)
                    {
                        crossKvCacheManager->storeContextBlocks(*llmReq);
                    }
                    llmReq->addNewTokens({itCount});
                    llmReq->setState(LlmRequestState::kGENERATION_IN_PROGRESS);
                }
            }
            else
            {
                kvCacheManager->addToken(llmReq->mRequestId);
                llmReq->addNewTokens({itCount});
            }
            if (llmReq->getNumTokens(0) == promptLen + llmReq->mMaxNewTokens)
            {
                llmReq->setState(LlmRequestState::kGENERATION_COMPLETE);
                kvCacheManager->removeSequence(llmReq->mRequestId, llmReq);
                if (crossKvCacheManager)
                {
                    crossKvCacheManager->removeSequence(llmReq->mRequestId, llmReq);
                }
            }
        }

        // Remove completed requests
        for (auto it = activeRequests.cbegin(); it != activeRequests.cend();)
        {
            auto const& llmReq = (*it);
            if (llmReq->getState() == LlmRequestState::kGENERATION_COMPLETE)
            {
                activeRequests.erase(it++);
            }
            else
            {
                it++;
            }
        }
        itCount++;
    }
    return itCount;
}

TEST_F(CapacitySchedulerTest, SimpleShouldFit)
{
    SizeType32 kvCacheMaxNumTokens = 200;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    auto capacitySchedulerPolicies
        = std::vector<CapacitySchedulerPolicy>{CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
            CapacitySchedulerPolicy::kMAX_UTILIZATION, CapacitySchedulerPolicy::kSTATIC_BATCH};
    // TODO: Support and add coverage for sinkTokenLen > 0. (e.g. 4)
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        for (auto sinkTokenLen : sinkTokenLens)
        {
            auto kvCacheManager = getKvCacheManager(
                maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
            auto peftCacheManager = getPeftCacheManager();
            auto capacityScheduler
                = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

            // Create two requests that should fit in the kvCache for the entire duration
            int32_t maxNewTokens = 80;
            int32_t promptLen = 10;
            RequestList activeRequests;
            activeRequests.push_back(createRequest(promptLen, maxNewTokens, 0, 1234));
            activeRequests.push_back(createRequest(promptLen, maxNewTokens, 1, 6789));

            std::vector<ExpectedState> expectedStates;
            expectedStates.push_back(ExpectedState{0, 80, {0, 1}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});

            // Callback called at each iteration, providing the option to add new active requests
            auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

            int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
                addNewRequestsCb, maxInputLen, peftCacheManager);
            EXPECT_EQ(numIterations, 80);
        }
    }
}
TEST_F(CapacitySchedulerTest, SimpleShouldFitWithCrossBlocks)
{
    // 90*2 = 180, plus one whole 10-token block reserved per request for the sink token, so 20 for 2 requests
    SizeType32 kvCacheMaxNumTokens = 200;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 crossKvCacheMaxNumTokens = 20; // 20 for 2 requests, encoder input len = 2*10
    SizeType32 crossKvCacheMaxNumTokensPerSeq = 10;
    SizeType32 crossKvCacheTokensPerBlock = 10;
    int32_t encoderInputLen = 10;
    SizeType32 crossKvsinkTokenLen = 0; // sink tokens are not accounted for in the cross KV cache
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;
    kv_cache_manager::CacheType cacheType = kv_cache_manager::CacheType::kCROSS;
    bool enableReuse = false;

    auto capacitySchedulerPolicies
        = std::vector<CapacitySchedulerPolicy>{CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT};
    // TODO: Support and add coverage for sinkTokenLen > 0. (e.g. 4)
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        for (auto sinkTokenLen : sinkTokenLens)
        {
            auto kvCacheManager = getKvCacheManager(
                maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
            auto crossKvCacheManager = getKvCacheManager(maxNumRequests, crossKvCacheTokensPerBlock,
                crossKvCacheMaxNumTokens, crossKvCacheMaxNumTokensPerSeq, crossKvsinkTokenLen, enableReuse, cacheType);
            auto peftCacheManager = getPeftCacheManager();
            auto capacityScheduler
                = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

            // Create two requests that should fit in the kvCache for the entire duration
            int32_t maxNewTokens = 80;
            int32_t promptLen = 10;
            RequestList activeRequests;
            activeRequests.push_back(createFromExecutorRequest(promptLen, maxNewTokens, encoderInputLen, 0, 1234));
            activeRequests.push_back(createFromExecutorRequest(promptLen, maxNewTokens, encoderInputLen, 1, 6789));

            std::vector<ExpectedState> expectedStates;
            expectedStates.push_back(ExpectedState{0, 80, {0, 1}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});

            // Callback called at each iteration, providing the option to add new active requests
            auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

            int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
                addNewRequestsCb, maxInputLen, peftCacheManager, crossKvCacheManager);
            EXPECT_EQ(numIterations, 80);
        }
    }
}
TEST_F(CapacitySchedulerTest, SimpleLoraFitsDuplicateTask)
{
    SizeType32 kvCacheMaxNumTokens = 210;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 4;
    SizeType32 maxInputLen = 1000;

    auto capacitySchedulerPolicies
        = std::vector<CapacitySchedulerPolicy>{CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
            CapacitySchedulerPolicy::kMAX_UTILIZATION, CapacitySchedulerPolicy::kSTATIC_BATCH};
    // TODO: Support and add coverage for sinkTokenLen > 0. (e.g. 4)
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        for (auto sinkTokenLen : sinkTokenLens)
        {
            auto kvCacheManager = getKvCacheManager(
                maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
            auto peftCacheManager = std::make_shared<MockPeftCacheManager>(15, 30, 30);
            auto capacityScheduler
                = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

            // Create three requests that should fit in the kvCache for the entire duration;
            // requests 0 and 2 share LoRA task 1234
            int32_t maxNewTokens = 50;
            int32_t promptLen = 10;
            RequestList activeRequests;
            activeRequests.push_back(createRequest(promptLen, maxNewTokens, 0, 1234));
            activeRequests.push_back(createRequest(promptLen, maxNewTokens, 1, 5678));
            activeRequests.push_back(createRequest(promptLen, maxNewTokens, 2, 1234));

            std::vector<ExpectedState> expectedStates;
            expectedStates.push_back(
                ExpectedState{0, 50, {0, 1, 2}, {{0, 50, 10, 0}, {1, 50, 10, 0}, {2, 50, 10, 0}}});

            // Callback called at each iteration, providing the option to add new active requests
            auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

            int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
                addNewRequestsCb, maxInputLen, peftCacheManager);
            EXPECT_EQ(numIterations, 50);
        }
    }
}
TEST_F(CapacitySchedulerTest, SimpleLoraDoesntFitDuplicateTask)
{
    SizeType32 kvCacheMaxNumTokens = 210;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 4;
    SizeType32 maxInputLen = 1000;

    auto capacitySchedulerPolicies
        = std::vector<CapacitySchedulerPolicy>{CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
            CapacitySchedulerPolicy::kMAX_UTILIZATION, CapacitySchedulerPolicy::kSTATIC_BATCH};
    // TODO: Support and add coverage for sinkTokenLen > 0. (e.g. 4)
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        std::vector<ExpectedState> expectedStates;
        uint32_t expectedIterations{0};
        if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kMAX_UTILIZATION)
        {
            expectedStates.emplace_back(ExpectedState{0, 50, {0, 1, 2}, {{0, 50, 10, 0}}});
            expectedStates.emplace_back(ExpectedState{50, 100, {1, 2}, {{1, 50, 10, 0}}});
            expectedStates.emplace_back(ExpectedState{100, 150, {2}, {{2, 50, 10, 0}}});
            expectedIterations = 150;
        }
        else if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT)
        {
            // NO_EVICT can guarantee no eviction with requests 0 and 2, so it will allocate them concurrently
            expectedStates.emplace_back(ExpectedState{0, 50, {0, 1, 2}, {{0, 50, 10, 0}, {2, 50, 10, 0}}});
            expectedStates.emplace_back(ExpectedState{50, 100, {1}, {{1, 50, 10, 0}}});
            expectedIterations = 100;
        }
        else if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kSTATIC_BATCH)
        {
            // STATIC_BATCH can guarantee no eviction with requests 0 and 2, so it will allocate them concurrently
            expectedStates.emplace_back(ExpectedState{0, 50, {0, 1, 2}, {{0, 50, 10, 0}, {2, 50, 10, 0}}});
            expectedStates.emplace_back(ExpectedState{50, 100, {1}, {{1, 50, 10, 0}}});
            expectedIterations = 100;
        }
        for (auto sinkTokenLen : sinkTokenLens)
        {
            auto kvCacheManager = getKvCacheManager(
                maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
            auto peftCacheManager = std::make_shared<MockPeftCacheManager>(20, 30, 30);
            auto capacityScheduler
                = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, static_cast<bool>(kvCacheManager));

            // Create three requests whose LoRA tasks do not all fit in the PEFT cache at once
            int32_t maxNewTokens = 50;
            int32_t promptLen = 10;
            RequestList activeRequests;
            activeRequests.push_back(createRequest(promptLen, maxNewTokens, 0, 1234));
            activeRequests.push_back(createRequest(promptLen, maxNewTokens, 1, 5678));
            activeRequests.push_back(createRequest(promptLen, maxNewTokens, 2, 1234));

            // Callback called at each iteration, providing the option to add new active requests
            auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

            int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
                addNewRequestsCb, maxInputLen, peftCacheManager);
            EXPECT_EQ(numIterations, expectedIterations);
        }
    }
}
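// Chunked-context test: a 50-token prompt processed in 20-token chunks needs three
// scheduler iterations before generation starts; the expected context positions at
// iterations 0..3 are 0, 20, 40 and 50.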
TEST_F(CapacitySchedulerTest, SimpleShouldFitInChunk)
{
    SizeType32 kvCacheMaxNumTokens = 200;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    auto capacitySchedulerPolicies
        = std::vector<CapacitySchedulerPolicy>{CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
            CapacitySchedulerPolicy::kMAX_UTILIZATION, CapacitySchedulerPolicy::kSTATIC_BATCH};
    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq);
        auto peftCacheManager = getPeftCacheManager();
        auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

        // Create two requests that should fit in the kvCache for the entire duration
        int32_t maxNewTokens = 40;
        int32_t promptLen = 50;
        RequestList activeRequests;
        auto request0 = createRequest(promptLen, maxNewTokens, 0, 1234);
        auto request1 = createRequest(promptLen, maxNewTokens, 1, 6789);
        request0->setContextChunkSize(20);
        request1->setContextChunkSize(20);
        activeRequests.push_back(std::move(request0));
        activeRequests.push_back(std::move(request1));

        std::vector<ExpectedState> expectedStates;
        expectedStates.push_back(ExpectedState{0, 1, {0, 1}, {{0, 40, 50, 0, 0}, {1, 40, 50, 0, 0}}});
        expectedStates.push_back(ExpectedState{1, 2, {0, 1}, {{0, 40, 50, 0, 20}, {1, 40, 50, 0, 20}}});
        expectedStates.push_back(ExpectedState{2, 3, {0, 1}, {{0, 40, 50, 0, 40}, {1, 40, 50, 0, 40}}});
        expectedStates.push_back(ExpectedState{3, 4, {0, 1}, {{0, 40, 50, 1, 50}, {1, 40, 50, 1, 50}}});
        expectedStates.push_back(ExpectedState{4, 42, {0, 1}, {{0, 40, 50, 2, 50}, {1, 40, 50, 2, 50}}});

        // Callback called at each iteration, providing the option to add new active requests
        auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

        int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        EXPECT_EQ(numIterations, 42);
    }
}
TEST_F(CapacitySchedulerTest, SimpleDoesntFitMaxUtilization)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    // TODO: Support and add coverage for sinkTokenLen > 0. (e.g. 4)
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto sinkTokenLen : sinkTokenLens)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
        auto peftCacheManager = getPeftCacheManager();
        CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kMAX_UTILIZATION;
        auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

        // Create two requests that will not fit in the kvCache for the entire duration
        int32_t maxNewTokens = 80;
        int32_t promptLen = 10;
        RequestList activeRequests;
        activeRequests.push_back(createRequest(promptLen, maxNewTokens, 0));
        activeRequests.push_back(createRequest(promptLen, maxNewTokens, 1));

        std::vector<ExpectedState> expectedStates;
        if (sinkTokenLen == 0)
        {
            // Up to iteration 41, kvCache is big enough, expect 2 requests
            expectedStates.push_back(ExpectedState{0, 41, {0, 1}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});
            // At iteration 41, running out of space, only one scheduled (req 0)
            expectedStates.push_back(ExpectedState{41, 80, {0, 1}, {{0, 80, 10, 41, 10}}});
            // At iteration 80, req 0 is done
            expectedStates.push_back(ExpectedState{80, 120, {1}, {{1, 39, 51, 0}}});
        }
        else if (sinkTokenLen == 4)
        {
            // Up to iteration 35, kvCache is big enough, expect 2 requests
            expectedStates.push_back(ExpectedState{0, 35, {0, 1}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});
            // At iteration 35, running out of space, only one scheduled (req 0)
            expectedStates.push_back(ExpectedState{35, 80, {0, 1}, {{0, 80, 10, 35, 10}}});
            // At iteration 80, req 0 is done
            expectedStates.push_back(ExpectedState{80, 126, {1}, {{1, 45, 45, 0}}});
        }

        // Callback called at each iteration, providing the option to add new active requests
        auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

        int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        int expectedNumIters = (sinkTokenLen == 0) ? 119 : 125;
        EXPECT_EQ(numIterations, expectedNumIters);
    }
}
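// Disaggregated-serving variant: requests arrive in kDISAGG_GENERATION_INIT, the
// scheduler reports them in a separate list, and runTest then allocates their KV cache
// and moves them directly to kGENERATION_IN_PROGRESS, so no context phase is observed.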
TEST_F(CapacitySchedulerTest, DisaggGenInitMaxUtilization)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;
    SizeType32 sinkTokenLen = 0;

    auto kvCacheManager = getKvCacheManager(
        maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
    auto peftCacheManager = getPeftCacheManager();
    CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kMAX_UTILIZATION;
    auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

    // Create two requests that will not fit in the kvCache for the entire duration
    int32_t maxNewTokens = 80;
    int32_t promptLen = 10;
    RequestList activeRequests;
    activeRequests.push_back(createRequest(promptLen, maxNewTokens, 0, std::nullopt,
        tensorrt_llm::executor::Request::kDefaultPriority, LlmRequestState::kDISAGG_GENERATION_INIT));
    activeRequests.push_back(createRequest(promptLen, maxNewTokens, 1, std::nullopt,
        tensorrt_llm::executor::Request::kDefaultPriority, LlmRequestState::kDISAGG_GENERATION_INIT));

    std::vector<ExpectedState> expectedStates;
    expectedStates.push_back(ExpectedState{0, 1, {0, 1}, {}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});
    // Up to iteration 41, kvCache is big enough, expect 2 requests
    expectedStates.push_back(ExpectedState{1, 41, {0, 1}, {{0, 80, 10, 1, 10}, {1, 80, 10, 1, 10}}});
    // At iteration 41, running out of space, only one scheduled (req 0)
    expectedStates.push_back(ExpectedState{41, 80, {0, 1}, {{0, 80, 10, 41, 10}}});
    // At iteration 80, req 0 is done
    expectedStates.push_back(ExpectedState{80, 120, {1}, {{1, 39, 51, 0}}});

    // Callback called at each iteration, providing the option to add new active requests
    auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

    int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates, addNewRequestsCb,
        maxInputLen, peftCacheManager, nullptr, true);
    int expectedNumIters = (sinkTokenLen == 0) ? 119 : 125;
    EXPECT_EQ(numIterations, expectedNumIters);
}

TEST_F(CapacitySchedulerTest, RequestsSortedByPriorities)
{
    RequestList activeRequests;
    insertRequestInOrder(activeRequests, createRequest(10, 20, 0, std::nullopt));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 1, std::nullopt));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 2, std::nullopt, 0.6));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 3, std::nullopt, 0.6));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 4, std::nullopt, 0.6));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 5, std::nullopt, 1.0));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 6, std::nullopt, 0.3));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 7, std::nullopt, 0.3));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 8, std::nullopt, 0.3));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 9, std::nullopt, 0.6));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 10, std::nullopt, 1.0));
    insertRequestInOrder(activeRequests, createRequest(10, 20, 11, std::nullopt));

    // Sorted by descending priority; FIFO among equal priorities (default priority is 0.5)
    std::vector<uint64_t> expectedOrder = {5, 10, 2, 3, 4, 9, 0, 1, 11, 6, 7, 8};
    int i = 0;
    for (auto const& a : activeRequests)
    {
        EXPECT_EQ(a->mRequestId, expectedOrder[i++]);
    }
}
TEST_F(CapacitySchedulerTest, SimpleDoesntFitPriorities)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    // TODO: Support and add coverage for sinkTokenLen > 0
    // Removed configuration:
    // {CapacitySchedulerPolicy::kMAX_UTILIZATION, 4, 125}
    // {CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, 1, 160}
    auto sinkTokenLens = std::vector<SizeType32>{0};
    auto configurations = std::vector<std::tuple<CapacitySchedulerPolicy, SizeType32, int>>{
        {CapacitySchedulerPolicy::kMAX_UTILIZATION, 0, 119}};
    for (auto [capacitySchedulerPolicy, sinkTokenLen, expectedNumIters] : configurations)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
        auto peftCacheManager = getPeftCacheManager();
        auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

        // Create two requests that will not fit in the kvCache for the entire duration
        int32_t maxNewTokens = 80;
        int32_t promptLen = 10;
        RequestList activeRequests;
        insertRequestInOrder(activeRequests, createRequest(promptLen, maxNewTokens, 0, std::nullopt));
        insertRequestInOrder(activeRequests, createRequest(promptLen, maxNewTokens, 1, std::nullopt, 0.6));

        std::vector<ExpectedState> expectedStates;
        if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kMAX_UTILIZATION && sinkTokenLen == 0)
        {
            // Up to iteration 41, kvCache is big enough, expect 2 requests
            expectedStates.push_back(ExpectedState{0, 41, {1, 0}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});
            // At iteration 41, running out of space, only one scheduled (req 1, as it has higher priority)
            expectedStates.push_back(ExpectedState{41, 80, {1, 0}, {{1, 80, 10, 41, 10}}});
            // At iteration 80, req 1 is done
            expectedStates.push_back(ExpectedState{80, 120, {0}, {{0, 39, 51, 0}}});
        }
        else if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kMAX_UTILIZATION && sinkTokenLen == 4)
        {
            // Up to iteration 35, kvCache is big enough, expect 2 requests
            expectedStates.push_back(ExpectedState{0, 35, {1, 0}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});
            // At iteration 35, running out of space, only one scheduled (req 1, as it has higher priority)
            expectedStates.push_back(ExpectedState{35, 80, {1, 0}, {{1, 80, 10, 35, 10}}});
            // At iteration 80, req 1 is done
            expectedStates.push_back(ExpectedState{80, 126, {0}, {{0, 45, 45, 0}}});
        }
        else if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT)
        {
            // It cannot fit both for all iterations, so it should pick req 1, which has higher priority
            expectedStates.push_back(ExpectedState{0, 80, {1, 0}, {{1, 80, 10, 0}}});
            // At iteration 80, req 1 is done
            expectedStates.push_back(ExpectedState{80, 160, {0}, {{0, 80, 10, 0}}});
        }

        // Callback called at each iteration, providing the option to add new active requests
        auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

        int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        EXPECT_EQ(numIterations, expectedNumIters);
    }
}

TEST_F(CapacitySchedulerTest, SimpleDoesntFitMaxUtilizationInChunk)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    auto kvCacheManager = getKvCacheManager(
        maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq);
    auto peftCacheManager = getPeftCacheManager();
    CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kMAX_UTILIZATION;
    auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

    // Create two requests that will not fit in the kvCache for the entire duration
    int32_t maxNewTokens = 60;
    int32_t promptLen = 30;
    RequestList activeRequests;
    auto request0 = createRequest(promptLen, maxNewTokens, 0);
    auto request1 = createRequest(promptLen, maxNewTokens, 1);
    request0->setContextChunkSize(20);
    request1->setContextChunkSize(20);
    activeRequests.push_back(std::move(request0));
    activeRequests.push_back(std::move(request1));

    std::vector<ExpectedState> expectedStates;
    expectedStates.push_back(ExpectedState{0, 2, {0, 1}, {{0, 60, 30, 0, 0}, {1, 60, 30, 0, 0}}});
    expectedStates.push_back(ExpectedState{2, 22, {0, 1}, {{0, 60, 30, 1, 30}, {1, 60, 30, 1, 30}}});
    expectedStates.push_back(ExpectedState{22, 61, {0, 1}, {{0, 60, 30, 21, 30}}});
    // The context may be chunked further at runtime, which is distinct from the situation above.
    expectedStates.push_back(ExpectedState{61, 100, {1}, {{1, 39, 51, 0, 0}}});

    // Callback called at each iteration, providing the option to add new active requests
    auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

    int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates, addNewRequestsCb,
        maxInputLen, peftCacheManager);
    EXPECT_EQ(numIterations, 100);
}

TEST_F(CapacitySchedulerTest, SimpleDoesntFitMaxUtilizationInChunkedCache)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    auto kvCacheManager = getKvCacheManager(
        maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq);
    auto peftCacheManager = getPeftCacheManager();
    CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kMAX_UTILIZATION;
    auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

    // Create two requests that will not fit in the kvCache for the entire duration
    int32_t maxNewTokens = 20;
    int32_t promptLen = 70;
    RequestList activeRequests;
    auto request0 = createRequest(promptLen, maxNewTokens, 0);
    auto request1 = createRequest(promptLen, maxNewTokens, 1);
    request0->setContextChunkSize(40);
    request1->setContextChunkSize(40);
    activeRequests.push_back(std::move(request0));
    activeRequests.push_back(std::move(request1));

    std::vector<ExpectedState> expectedStates;
    expectedStates.push_back(ExpectedState{0, 1, {0, 1}, {{0, 20, 70, 0, 0}}});
    expectedStates.push_back(ExpectedState{1, 2, {0, 1}, {{0, 20, 70, 0, 40}}});
    expectedStates.push_back(ExpectedState{2, 21, {0, 1}, {{0, 20, 70, 1, 70}}});
    expectedStates.push_back(ExpectedState{21, 22, {1}, {{1, 20, 70, 0, 0}}});
    expectedStates.push_back(ExpectedState{22, 23, {1}, {{1, 20, 70, 0, 40}}});
    expectedStates.push_back(ExpectedState{23, 24, {1}, {{1, 20, 70, 1, 70}}});
    expectedStates.push_back(ExpectedState{24, 42, {1}, {{1, 20, 70, 2, 70}}});

    // Callback called at each iteration, providing the option to add new active requests
    auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

    int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates, addNewRequestsCb,
        maxInputLen, peftCacheManager);
    EXPECT_EQ(numIterations, 42);
}
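// Draft tokens enlarge a request's KV footprint when the scheduler projects capacity:
// with 5 and 10 draft tokens attached, the pool below is exhausted at iteration 31
// rather than at iteration 41 as in SimpleDoesntFitMaxUtilization.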
TEST_F(CapacitySchedulerTest, SimpleDoesntFitMaxUtilizationDraftTokens)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;
    SizeType32 numDraftTokens1 = 5;
    SizeType32 numDraftTokens2 = 10;
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, 0);
        auto peftCacheManager = getPeftCacheManager();
        CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kMAX_UTILIZATION;
        auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

        // Create two requests that will not fit in the kvCache for the entire duration
        int32_t maxNewTokens = 80;
        int32_t promptLen = 10;
        auto const draftTokens1 = std::make_shared<std::vector<int32_t>>(std::vector<int32_t>(numDraftTokens1));
        auto const draftTokens2 = std::make_shared<std::vector<int32_t>>(std::vector<int32_t>(numDraftTokens2));
        RequestList activeRequests;
        activeRequests.push_back(createRequest(promptLen, maxNewTokens, 0));
        activeRequests.back()->setDraftTokens(draftTokens1);
        activeRequests.push_back(createRequest(promptLen, maxNewTokens, 1));
        activeRequests.back()->setDraftTokens(draftTokens2);

        std::vector<ExpectedState> expectedStates;
        // Up to iteration 31, kvCache is big enough, expect 2 requests
        expectedStates.push_back(ExpectedState{0, 31, {0, 1}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});
        // At iteration 31, running out of space, only one scheduled (req 0)
        expectedStates.push_back(ExpectedState{31, 80, {0, 1}, {{0, 80, 10, 31, 10}}});
        // At iteration 80, req 0 is done
        expectedStates.push_back(ExpectedState{80, 129, {1}, {{1, 49, 41, 0}}});

        // Callback called at each iteration, providing the option to add new active requests
        auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

        int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        int expectedNumIters = 129;
        EXPECT_EQ(numIterations, expectedNumIters);
    }
}

TEST_F(CapacitySchedulerTest, SimpleDoesntFitGuaranteedCompletion)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    // TODO: Support and add coverage for sinkTokenLen > 0. (e.g. 4)
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto sinkTokenLen : sinkTokenLens)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
        auto peftCacheManager = getPeftCacheManager();
        CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT;
        auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

        // Create two requests that will not fit in the kvCache for the entire duration
        int32_t maxNewTokens = 80;
        int32_t promptLen = 10;
        RequestList activeRequests;
        activeRequests.push_back(createRequest(promptLen, maxNewTokens, 0));
        activeRequests.push_back(createRequest(promptLen, maxNewTokens, 1));

        std::vector<ExpectedState> expectedStates;
        expectedStates.push_back(ExpectedState{0, 80, {0, 1}, {{0, 80, 10, 0}}});
        // At iteration 80, req 0 is done
        expectedStates.push_back(ExpectedState{80, 160, {1}, {{1, 80, 10, 0}}});

        // Callback called at each iteration, providing the option to add new active requests
        auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

        int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        EXPECT_EQ(numIterations, 160);
    }
}
TEST_F(CapacitySchedulerTest, SimpleDoesntFitWithCrossBlocks)
{
    // 90*2 = 180, plus one whole 10-token block reserved per request for the sink token, so 20 for 2 requests
    SizeType32 kvCacheMaxNumTokens = 200;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    int32_t encoderInputLen = 10;
    // Only one request should be able to fit: the encoder input len is 10, hence 1 block.
    SizeType32 crossKvCacheMaxNumTokens = 10;
    SizeType32 crossKvCacheTokensPerBlock = 10;
    SizeType32 crossKvCacheMaxNumTokensPerSeq = 10;
    SizeType32 crossKvsinkTokenLen = 0;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;
    kv_cache_manager::CacheType cacheType = kv_cache_manager::CacheType::kCROSS;
    bool enableReuse = false;

    auto capacitySchedulerPolicies
        = std::vector<CapacitySchedulerPolicy>{CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT};
    // TODO: Support and add coverage for sinkTokenLen > 0
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        for (auto sinkTokenLen : sinkTokenLens)
        {
            auto kvCacheManager = getKvCacheManager(
                maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
            auto crossKvCacheManager = getKvCacheManager(maxNumRequests, crossKvCacheTokensPerBlock,
                crossKvCacheMaxNumTokens, crossKvCacheMaxNumTokensPerSeq, crossKvsinkTokenLen, enableReuse, cacheType);
            auto peftCacheManager = getPeftCacheManager();
            auto capacityScheduler
                = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

            // Create two requests; only one fits in the cross KV cache at a time
            int32_t maxNewTokens = 80;
            int32_t promptLen = 10;
            RequestList activeRequests;
            activeRequests.push_back(createFromExecutorRequest(promptLen, maxNewTokens, encoderInputLen, 0, 1234));
            activeRequests.push_back(createFromExecutorRequest(promptLen, maxNewTokens, encoderInputLen, 1, 6789));

            std::vector<ExpectedState> expectedStates;
            expectedStates.push_back(ExpectedState{0, 80, {0, 1}, {{0, 80, 10, 0}}});
            // At iteration 80, req 0 is done
            expectedStates.push_back(ExpectedState{80, 160, {1}, {{1, 80, 10, 0}}});

            // Callback called at each iteration, providing the option to add new active requests
            auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

            int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
                addNewRequestsCb, maxInputLen, peftCacheManager, crossKvCacheManager);
            EXPECT_EQ(numIterations, 160);
        }
    }
}
TEST_F(CapacitySchedulerTest, SimpleDoesntFitGuaranteedCompletionInChunk)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    auto kvCacheManager = getKvCacheManager(
        maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq);
    auto peftCacheManager = getPeftCacheManager();
    CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT;
    auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

    // Create two requests that do not both fit in the kvCache at the same time
    int32_t maxNewTokens = 60;
    int32_t promptLen = 30;
    RequestList activeRequests;
    auto request0 = createRequest(promptLen, maxNewTokens, 0);
    auto request1 = createRequest(promptLen, maxNewTokens, 1);
    request0->setContextChunkSize(20);
    request1->setContextChunkSize(20);
    activeRequests.push_back(std::move(request0));
    activeRequests.push_back(std::move(request1));

    std::vector<ExpectedState> expectedStates;
    expectedStates.push_back(ExpectedState{0, 1, {0, 1}, {{0, 60, 30, 0, 0}}});
    expectedStates.push_back(ExpectedState{1, 2, {0, 1}, {{0, 60, 30, 0, 20}}});
    expectedStates.push_back(ExpectedState{2, 3, {0, 1}, {{0, 60, 30, 1, 30}}});
    expectedStates.push_back(ExpectedState{3, 61, {0, 1}, {{0, 60, 30, 2, 30}}});
    // At iteration 61, req 0 is done
    expectedStates.push_back(ExpectedState{61, 62, {1}, {{1, 60, 30, 0, 0}}});
    expectedStates.push_back(ExpectedState{62, 63, {1}, {{1, 60, 30, 0, 20}}});
    expectedStates.push_back(ExpectedState{63, 64, {1}, {{1, 60, 30, 1, 30}}});
    expectedStates.push_back(ExpectedState{64, 122, {1}, {{1, 60, 30, 2, 30}}});

    // Callback called at each iteration, providing the option to add new active requests
    auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

    int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates, addNewRequestsCb,
        maxInputLen, peftCacheManager);
    EXPECT_EQ(numIterations, 122);
}
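// The expectations below reason in KV cache blocks: with 100 tokens of capacity and
// 10 tokens per block there are 10 blocks, a request holding t tokens occupies
// ceil(t / 10) of them, and an eviction is expected as soon as the scheduled requests
// would need an 11th block.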
TEST_F(CapacitySchedulerTest, SimpleDoesntFitAddingNewRequestsMaxUtilization)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 4;
    SizeType32 maxInputLen = 1000;

    // TODO: Support and add coverage for sinkTokenLen > 0
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto sinkTokenLen : sinkTokenLens)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
        auto peftCacheManager = getPeftCacheManager();
        auto capacityScheduler
            = CapacityScheduler(maxNumRequests, CapacitySchedulerPolicy::kMAX_UTILIZATION, kvCacheManager != nullptr);

        // Initially two requests that will not fit in the kvCache for the entire duration
        int32_t maxNewTokens = 80;
        int32_t promptLen = 10;
        RequestList initActiveRequests;
        initActiveRequests.push_back(createRequest(promptLen, maxNewTokens, 0));
        initActiveRequests.push_back(createRequest(promptLen, maxNewTokens, 1));

        // Callback called at each iteration, providing the option to add new active requests
        auto addNewRequestsCb = [this, promptLen, maxNewTokens](RequestList& activeRequests, int itCount)
        {
            if (itCount == 10)
            {
                activeRequests.push_back(createRequest(promptLen, maxNewTokens, 2));
                activeRequests.push_back(createRequest(promptLen, maxNewTokens, 3));
            }
        };

        std::vector<ExpectedState> expectedStates;
        if (sinkTokenLen == 0)
        {
            // Up to iteration 10, kvCache is big enough, expect 2 requests
            expectedStates.push_back(ExpectedState{0, 10, {0, 1}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});
            // At iteration 10, two more requests get added; they fit in the KV cache
            expectedStates.push_back(ExpectedState{
                10, 21, {0, 1, 2, 3}, {{0, 80, 10, 10, 10}, {1, 80, 10, 10, 10}, {2, 80, 10, 0}, {3, 80, 10, 0}}});
            // At iteration 21, running out of kv cache
            // Req 0 and 1 used 31 tokens each (62 tokens)
            // Req 2 and 3 used 21 tokens each (42 tokens)
            // Each sequence needs 1 more block -> need to drop two requests
            expectedStates.push_back(
                ExpectedState{21, 41, {0, 1, 2, 3}, {{0, 80, 10, 21, 10}, {1, 80, 10, 21, 10}}});
            // At iteration 41, running out of kv cache again
            // Req 0 used 51 tokens
            // Req 1 used 51 tokens
            expectedStates.push_back(ExpectedState{41, 80, {0, 1, 2, 3}, {{0, 80, 10, 41, 10}}});
            // At it 80, req 0 is done, 100 free kv tokens; req 1 has 51 tokens (needs 6 blks),
            // req 2 has 21 tokens (needs 3 blks), req 3 has 21 tokens (needs 3 blks)
            expectedStates.push_back(ExpectedState{80, 90, {1, 2, 3}, {{1, 39, 51, 0, 0}, {2, 69, 21, 0}}});
            // At it 90, running out of kv cache again
            // req 1 used 61 tokens
            // req 2 used 31 tokens
            // So we need two more blocks -> need to drop req 2
            expectedStates.push_back(ExpectedState{90, 119, {1, 2, 3}, {{1, 39, 51, 10, 51}}});
            // At it 119, req 1 is done
            // Req 2 used 31 tokens
            // Req 3 used 21 tokens
            // Need a total of 7 blocks
            expectedStates.push_back(ExpectedState{119, 139, {2, 3}, {{2, 59, 31, 0}, {3, 69, 21, 0}}});
            // At it 139, run out of kv cache again
            // Req 2 used 51 tokens
            // Req 3 used 41 tokens
            // Need two more blocks -> need to drop req 3
            expectedStates.push_back(ExpectedState{139, 177, {2, 3}, {{2, 59, 31, 20, 31}}});
            // At it 178, req 2 is done
            // Only req 3 is remaining -> 41 tokens generated
            expectedStates.push_back(ExpectedState{178, 227, {3}, {{3, 49, 41, 0}}});
        }
        else if (sinkTokenLen == 4)
        {
            // Up to iteration 10, kvCache is big enough, expect 2 requests
            expectedStates.push_back(ExpectedState{0, 10, {0, 1}, {{0, 80, 10, 0, 0}, {1, 80, 10, 0, 0}}});
            // At iteration 10, two more requests get added; they fit in the KV cache
            expectedStates.push_back(ExpectedState{
                10, 15, {0, 1, 2, 3}, {{0, 80, 10, 10, 10}, {1, 80, 10, 10, 10}, {2, 80, 10, 0}, {3, 80, 10, 0}}});
            // At iteration 15, running out of kv cache
            // Req 0 and 1 used 31 tokens each (62 tokens), including 6 bubble tokens
            // Req 2 and 3 used 21 tokens each (42 tokens), including 6 bubble tokens
            // Each sequence needs 1 more block -> need to drop two requests
            expectedStates.push_back(
                ExpectedState{15, 35, {0, 1, 2, 3}, {{0, 80, 10, 15, 10}, {1, 80, 10, 15, 10}}});
            // At iteration 35, running out of kv cache again
            // Req 0 used 51 tokens, including 6 bubble tokens
            // Req 1 used 51 tokens, including 6 bubble tokens
            expectedStates.push_back(ExpectedState{35, 80, {0, 1, 2, 3}, {{0, 80, 10, 35, 10}}});
            // At it 80, req 0 is done, 100 free kv tokens; req 1 has 45 tokens (needs 6 blks),
            // req 2 has 15 tokens (needs 3 blks), req 3 has 15 tokens (needs 3 blks)
            expectedStates.push_back(ExpectedState{80, 90, {1, 2, 3}, {{1, 45, 45, 0, 0}, {2, 75, 15, 0}}});
            // At it 90, running out of kv cache again
            // req 1 used 61 tokens, including 6 bubble tokens
            // req 2 used 31 tokens, including 6 bubble tokens
            // So we need two more blocks -> need to drop req 2
            expectedStates.push_back(ExpectedState{90, 125, {1, 2, 3}, {{1, 45, 45, 10, 45}}});
            // At it 125, req 1 is done
            // Req 2 used 31 tokens, including 6 bubble tokens
            // Req 3 used 21 tokens, including 6 bubble tokens
            // Need a total of 7 blocks
            expectedStates.push_back(ExpectedState{125, 145, {2, 3}, {{2, 65, 25, 0, 0}, {3, 75, 15, 0}}});
            // At it 145, run out of kv cache again
            // Req 2 used 51 tokens, including 6 bubble tokens
            // Req 3 used 41 tokens, including 6 bubble tokens
            // Need two more blocks -> need to drop req 3
            expectedStates.push_back(ExpectedState{145, 189, {2, 3}, {{2, 65, 25, 20, 25}}});
            // At it 190, req 2 is done
            // Only req 3 is remaining -> 35 tokens generated
            expectedStates.push_back(ExpectedState{190, 245, {3}, {{3, 55, 35, 0}}});
        }

        // Callback to call after scheduling requests
        int numIterations = runTest(capacityScheduler, kvCacheManager, initActiveRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        int expectedNumIters = (sinkTokenLen == 0) ? 227 : 245;
        EXPECT_EQ(numIterations, expectedNumIters);
    }
}
TEST_F(CapacitySchedulerTest, SimpleSurpassMaxNumRequestsWithPriorities)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;
    auto sinkTokenLen = 0;

    auto kvCacheManager = getKvCacheManager(
        maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
    auto peftCacheManager = getPeftCacheManager();
    auto capacityScheduler
        = CapacityScheduler(maxNumRequests, CapacitySchedulerPolicy::kMAX_UTILIZATION, kvCacheManager != nullptr);

    int32_t maxNewTokens = 10;
    int32_t promptLen = 10;
    RequestList initActiveRequests;
    insertRequestInOrder(initActiveRequests, createRequest(promptLen, maxNewTokens, 0));
    insertRequestInOrder(initActiveRequests, createRequest(promptLen, maxNewTokens, 1));

    // Callback called at each iteration, providing the option to add new active requests
    auto addNewRequestsCb = [this, promptLen, maxNewTokens](RequestList& activeRequests, int itCount)
    {
        if (itCount == 2)
        {
            insertRequestInOrder(activeRequests, createRequest(promptLen, maxNewTokens, 2, std::nullopt, 0.6));
            insertRequestInOrder(activeRequests, createRequest(promptLen, maxNewTokens, 3, std::nullopt, 0.7));
        }
    };

    std::vector<ExpectedState> expectedStates;
    // Up to iteration 2, expect 2 requests
    expectedStates.push_back(ExpectedState{0, 2, {0, 1}, {{0, 10, 10, 0}, {1, 10, 10, 0}}});
    // At iteration 2, two more requests get added. They have higher priority than the
    // running requests, so they are placed ahead of them in activeRequests. maxNumRequests is 2,
    // so only the newly added requests should run.
    expectedStates.push_back(ExpectedState{2, 12, {3, 2, 0, 1}, {{2, 10, 10, 0}, {3, 10, 10, 0}}});
    // At iteration 12, reqs 2 and 3 have completed.
    expectedStates.push_back(ExpectedState{12, 20, {0, 1}, {{0, 8, 12, 0}, {1, 8, 12, 0}}});

    // Callback to call after scheduling requests
    int numIterations = runTest(capacityScheduler, kvCacheManager, initActiveRequests, expectedStates,
        addNewRequestsCb, maxInputLen, peftCacheManager);
    EXPECT_EQ(numIterations, 20);
}

TEST_F(CapacitySchedulerTest, SimpleDoesntFitAddingNewRequestsMaxUtilizationPriorities)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 4;
    SizeType32 maxInputLen = 1000;

    // TODO: Support and add coverage for sinkTokenLen > 0
    auto sinkTokenLens = std::vector<SizeType32>{0};
    for (auto sinkTokenLen : sinkTokenLens)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
        auto peftCacheManager = getPeftCacheManager();
        auto capacityScheduler
            = CapacityScheduler(maxNumRequests, CapacitySchedulerPolicy::kMAX_UTILIZATION, kvCacheManager != nullptr);

        // Initially two requests that will not fit in the kvCache for the entire duration
        int32_t maxNewTokens = 80;
        int32_t promptLen = 10;
        RequestList initActiveRequests;
        insertRequestInOrder(initActiveRequests, createRequest(promptLen, maxNewTokens, 0));
        insertRequestInOrder(initActiveRequests, createRequest(promptLen, maxNewTokens, 1));

        // Callback called at each iteration, providing the option to add new active requests
        auto addNewRequestsCb = [this, promptLen, maxNewTokens](RequestList& activeRequests, int itCount)
        {
            if (itCount == 10)
            {
                insertRequestInOrder(activeRequests, createRequest(promptLen, maxNewTokens, 2, std::nullopt, 0.6));
                insertRequestInOrder(activeRequests, createRequest(promptLen, maxNewTokens, 3, std::nullopt, 0.7));
            }
        };

        std::vector<ExpectedState> expectedStates;
        if (sinkTokenLen == 0)
        {
            // Up to iteration 10, kvCache is big enough, expect 2 requests
            expectedStates.push_back(ExpectedState{0, 10, {0, 1}, {{0, 80, 10, 0}, {1, 80, 10, 0}}});
            // At iteration 10, two more requests get added; they fit in the KV cache
            expectedStates.push_back(ExpectedState{
                10, 21, {3, 2, 0, 1}, {{0, 80, 10, 10, 10}, {1, 80, 10, 10, 10}, {2, 80, 10, 0}, {3, 80, 10, 0}}});
            // At iteration 21, running out of kv cache
            // Req 0 and 1 used 31 tokens each (62 tokens, 6 blocks)
            // Req 2 and 3 used 21 tokens each (42 tokens, 4 blocks)
            // Each sequence needs 1 more block -> need to drop a sequence. The lowest priority one is req 1,
            // which has priority 0.5 and arrived later than req 0. It liberates 3 blocks, enough to continue
            // with the three ongoing reqs for another 10 tokens each.
            expectedStates.push_back(
                ExpectedState{21, 31, {3, 2, 0, 1}, {{0, 80, 10, 21, 10}, {2, 80, 10, 11, 10}, {3, 80, 10, 11, 10}}});
            // At iteration 31, running out of kv cache again
            // Req 0 used 41 tokens (4 blocks)
            // Req 2 and 3 used 31 tokens each (62 tokens, 6 blocks)
            // Freeing req 0, with priority 0.5, liberates 4 blocks, good for another 20 tokens for 2 reqs.
            expectedStates.push_back(ExpectedState{31, 51, {3, 2, 0, 1}, {{2, 80, 10, 21, 10}, {3, 80, 10, 21, 10}}});
            // At iteration 51, running out of kv cache again
            // Reqs 2 and 3 used 51 tokens each (102 tokens, 10 blocks)
            // Req 2 has priority 0.6 and req 3 has priority 0.7, so req 2 will be freed, liberating 5 blocks.
            expectedStates.push_back(ExpectedState{51, 90, {3, 2, 0, 1}, {{3, 80, 10, 41, 10}}});
            // At it 90, req 3 is done, 100 free kv tokens (10 blocks).
            // req 2 has 51 tokens (needs 6 blocks)
            // req 0 has 41 tokens (needs 5 blocks)
            // req 1 has 31 tokens (needs 4 blocks)
            // Therefore, only req 2 will be scheduled (req 1 will not be scheduled as it would jump the order).
            expectedStates.push_back(ExpectedState{90, 129, {2, 0, 1}, {{2, 39, 51, 0}}});
            // At it 129, req 2 is done
            // req 0 has 41 tokens (needs 5 blocks)
            // req 1 has 31 tokens (needs 4 blocks)
            // Both req 0 and 1 are scheduled.
            expectedStates.push_back(ExpectedState{129, 139, {0, 1}, {{0, 49, 41, 0}, {1, 59, 31, 0}}});
            // At it 139
            // req 0 has 51 tokens (needs 6 blocks)
            // req 1 has 41 tokens (needs 5 blocks)
            // Therefore we need to drop one, which is req 1 due to same priority / later arrival.
            expectedStates.push_back(ExpectedState{139, 178, {0, 1}, {{0, 49, 41, 10, 41}}});
            // At it 178, req 0 is done
            // req 1 has 41 tokens and is scheduled at long last.
            expectedStates.push_back(ExpectedState{178, 227, {1}, {{1, 49, 41, 0}}});
        }
        else if (sinkTokenLen == 4)
        {
            // Up to iteration 10, kvCache is big enough, expect 2 requests
            expectedStates.push_back(ExpectedState{0, 10, {0, 1}, {{0, 80, 10, 0, 0}, {1, 80, 10, 0, 0}}});
            // At iteration 10, two more requests get added; they fit in the KV cache
            expectedStates.push_back(ExpectedState{
                10, 15, {3, 2, 0, 1}, {{0, 80, 10, 10, 10}, {1, 80, 10, 10, 10}, {2, 80, 10, 0}, {3, 80, 10, 0}}});
            // At iteration 15, running out of kv cache
            // Req 0 and 1 used 31 tokens each (62 tokens), including 6 bubble tokens
            // Req 2 and 3 used 21 tokens each (42 tokens), including 6 bubble tokens
            // Each sequence needs 1 more block -> need to drop the lowest prio: req 1
            expectedStates.push_back(
                ExpectedState{15, 25, {3, 2, 0, 1}, {{0, 80, 10, 15, 10}, {2, 80, 10, 5, 10}, {3, 80, 10, 5, 10}}});
            // At iteration 25, running out of kv cache again
            // Req 0 used 41 tokens, including 6 bubble tokens
            // Req 2 and 3 used 31 tokens each, including 6 bubble tokens
            // Free req 0, which has the lowest priority.
            expectedStates.push_back(ExpectedState{25, 45, {3, 2, 0, 1}, {{2, 80, 10, 15, 10}, {3, 80, 10, 15, 10}}});
            // At it 45, running out of kv cache again
            // Req 2 and 3 used 51 tokens each, including 6 bubble tokens
            // Free req 2 (lowest priority)
            expectedStates.push_back(ExpectedState{45, 90, {3, 2, 0, 1}, {{3, 80, 10, 35, 10}}});
            // At it 90, req 3 finished, 100 tokens up for grabs
            // req 2 used 51 tokens (including 6 bubble tokens, always assumed from now on) (requires 6 blocks)
            // req 0 used 41 tokens (requires 5 blocks)
            // req 1 used 31 tokens (requires 4 blocks)
            // Only req 2 can be scheduled now (higher priority).
            expectedStates.push_back(ExpectedState{90, 135, {2, 0, 1}, {{2, 45, 45, 0}}});
            // At it 135, req 2 is done
            // req 0 used 41 tokens (requires 5 blocks)
            // req 1 used 31 tokens (requires 4 blocks)
            // Both can be scheduled for 10 tokens
            expectedStates.push_back(ExpectedState{135, 145, {0, 1}, {{0, 55, 35, 0}, {1, 65, 25, 0}}});
            // At it 145, run out of kv cache again
            // req 0 used 51 tokens (requires 6 blocks)
            // req 1 used 41 tokens (requires 5 blocks)
            // Free req 1 (lowest priority)
            expectedStates.push_back(ExpectedState{145, 190, {0, 1}, {{0, 55, 35, 10, 35}}});
            // At it 190, req 0 is done
            // Only req 1 is remaining
            expectedStates.push_back(ExpectedState{190, 245, {1}, {{1, 55, 35, 0}}});
        }

        // Callback to call after scheduling requests
        int numIterations = runTest(capacityScheduler, kvCacheManager, initActiveRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        int expectedNumIters = (sinkTokenLen == 0) ? 227 : 245;
        EXPECT_EQ(numIterations, expectedNumIters);
    }
}
        // Run the test and check the expected states at each iteration
        int numIterations = runTest(capacityScheduler, kvCacheManager, initActiveRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        int expectedNumIters = (sinkTokenLen == 0) ? 227 : 245;
        EXPECT_EQ(numIterations, expectedNumIters);
    }
}

TEST_F(CapacitySchedulerTest, SimpleDoesntFitAddingNewRequestsGuaranteedCompletion)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 4;
    SizeType32 maxInputLen = 1000;

    // TODO: Support and add coverage for sinkTokenLen > 0
    auto sinkTokenLens = std::vector<SizeType32>{0};

    for (auto sinkTokenLen : sinkTokenLens)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
        auto peftCacheManager = getPeftCacheManager();
        auto capacityScheduler
            = CapacityScheduler(maxNumRequests, CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, kvCacheManager != nullptr);

        int32_t maxNewTokens = 80;
        int32_t promptLen = 10;
        RequestList initActiveRequests;
        initActiveRequests.push_back(createRequest(promptLen, maxNewTokens, 0));
        initActiveRequests.push_back(createRequest(promptLen, maxNewTokens, 1));

        // Callback called at each iteration, giving the option to add new active requests
        auto addNewRequestsCb = [this, promptLen, maxNewTokens](RequestList& activeRequests, int itCount)
        {
            if (itCount == 10)
            {
                activeRequests.push_back(createRequest(promptLen, maxNewTokens, 2));
                activeRequests.push_back(createRequest(promptLen, maxNewTokens, 3));
            }
        };

        std::vector<ExpectedState> expectedStates;
        // Up to iteration 10, expect 1 scheduled, 2 active
        expectedStates.push_back(ExpectedState{0, 10, {0, 1}, {{0, 80, 10, 0}}});
        // At iteration 80, req 0 is done; the remaining requests then run one at a time, 80 iterations each
        expectedStates.push_back(ExpectedState{10, 80, {0, 1, 2, 3}, {{0, 80, 10, 10, 10}}});
        expectedStates.push_back(ExpectedState{80, 160, {1, 2, 3}, {{1, 80, 10, 0}}});
        expectedStates.push_back(ExpectedState{160, 240, {2, 3}, {{2, 80, 10, 0}}});
        expectedStates.push_back(ExpectedState{240, 320, {3}, {{3, 80, 10, 0}}});

        // Run the test and check the expected states at each iteration
        int numIterations = runTest(capacityScheduler, kvCacheManager, initActiveRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        EXPECT_EQ(numIterations, 320);
    }
}
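// A minimal sketch (hypothetical helper, not used by the suite) of the admission arithmetic
// behind kGUARANTEED_NO_EVICT in the test above: a request is only schedulable if its
// worst-case footprint of divUp(promptLen + maxNewTokens, tokensPerBlock) blocks fits in the
// free pool. With 10 + 80 tokens and 10 tokens per block, each request needs 9 of the 10
// available blocks, which is why only one request runs at a time.
[[maybe_unused]] static SizeType32 worstCaseBlocks(
    SizeType32 promptLen, SizeType32 maxNewTokens, SizeType32 tokensPerBlock)
{
    return tc::divUp(promptLen + maxNewTokens, tokensPerBlock);
}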
TEST_F(CapacitySchedulerTest, SimpleDoesntFitAddingNewRequestsGuaranteedCompletionInChunk)
{
    SizeType32 kvCacheMaxNumTokens = 100;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 4;
    SizeType32 maxInputLen = 1000;

    // TODO: Support and add coverage for sinkTokenLen > 0
    auto sinkTokenLens = std::vector<SizeType32>{0};

    for (auto sinkTokenLen : sinkTokenLens)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
        auto peftCacheManager = getPeftCacheManager();
        auto capacityScheduler
            = CapacityScheduler(maxNumRequests, CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, kvCacheManager != nullptr);

        int32_t maxNewTokens = 60;
        int32_t promptLen = 30;
        RequestList initActiveRequests;
        auto request0 = createRequest(promptLen, maxNewTokens, 0);
        auto request1 = createRequest(promptLen, maxNewTokens, 1);
        request0->setContextChunkSize(20);
        request1->setContextChunkSize(20);
        initActiveRequests.push_back(std::move(request0));
        initActiveRequests.push_back(std::move(request1));

        // Callback called at each iteration, giving the option to add new active requests
        auto addNewRequestsCb = [this, promptLen, maxNewTokens](RequestList& activeRequests, int itCount)
        {
            if (itCount == 10)
            {
                auto request2 = createRequest(promptLen, maxNewTokens, 2);
                auto request3 = createRequest(promptLen, maxNewTokens, 3);
                request2->setContextChunkSize(20);
                request3->setContextChunkSize(20);
                activeRequests.push_back(std::move(request2));
                activeRequests.push_back(std::move(request3));
            }
        };

        std::vector<ExpectedState> expectedStates;
        // Up to iteration 10, expect 1 scheduled, 2 active
        expectedStates.push_back(ExpectedState{0, 10, {0, 1}, {{0, 60, 30, 0}}});
        // At iteration 61, req 0 is done; the remaining requests then run one at a time, 61 iterations each
        expectedStates.push_back(ExpectedState{10, 61, {0, 1, 2, 3}, {{0, 60, 30, 9, 30}}});
        expectedStates.push_back(ExpectedState{61, 122, {1, 2, 3}, {{1, 60, 30, 0}}});
        expectedStates.push_back(ExpectedState{122, 183, {2, 3}, {{2, 60, 30, 0}}});
        expectedStates.push_back(ExpectedState{183, 244, {3}, {{3, 60, 30, 0}}});

        // Run the test and check the expected states at each iteration
        int numIterations = runTest(capacityScheduler, kvCacheManager, initActiveRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        EXPECT_EQ(numIterations, 244);
    }
}
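// Chunked-context accounting for the test above (explanatory note, not part of the original):
// with a 30-token prompt and a 20-token context chunk, the context phase spans two iterations
// (20 + 10 tokens), with the first of the 60 generated tokens produced at the end of the
// second one, hence the 61-iteration cadence per request in the expected states.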
TEST_F(CapacitySchedulerTest, DelayDuplicateRequest)
{
    SizeType32 kvCacheMaxNumTokens = 200;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 50;
    SizeType32 maxNumRequests = 3;
    SizeType32 maxInputLen = 1000;
    bool enableReuse = true;

    auto capacitySchedulerPolicies = std::vector<CapacitySchedulerPolicy>{CapacitySchedulerPolicy::kMAX_UTILIZATION,
        CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, CapacitySchedulerPolicy::kSTATIC_BATCH};
    auto sinkTokenLens = std::vector<SizeType32>{0}; // sinkTokenLen > 0 is not supported with KV cache reuse

    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        for (auto sinkTokenLen : sinkTokenLens)
        {
            auto kvCacheManager = getKvCacheManager(maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens,
                kvCacheMaxNumTokensPerSeq, sinkTokenLen, enableReuse);
            auto peftCacheManager = getPeftCacheManager();
            auto capacityScheduler
                = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

            // Create three identical requests that should fit in the kvCache for their entire duration
            int32_t maxNewTokens = 2;
            // promptLen must be one greater than a multiple of kvCacheTokensPerBlock
            // because we don't reuse the last input token
            int32_t promptLen = 2 * kvCacheTokensPerBlock + 1;
            RequestList activeRequests;
            auto inputTokens = std::make_shared<VecTokens>(promptLen, 1);
            std::iota(inputTokens->begin(), inputTokens->end(), 0);
            activeRequests.push_back(createRequest(inputTokens, maxNewTokens, 0, 1234));
            activeRequests.push_back(createRequest(inputTokens, maxNewTokens, 1, 1234));
            activeRequests.push_back(createRequest(inputTokens, maxNewTokens, 2, 1234));

            std::vector<ExpectedState> expectedStates;
            // No delay in static batching.
            if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kSTATIC_BATCH)
            {
                expectedStates.push_back(ExpectedState{0, maxNewTokens, {0, 1, 2},
                    {{0, maxNewTokens, promptLen, 0}, {1, maxNewTokens, promptLen, 0},
                        {2, maxNewTokens, promptLen, 0}}});
            }
            else
            {
                expectedStates.push_back(ExpectedState{0, 1, {0, 1, 2}, {{0, maxNewTokens, promptLen, 0}}});
                expectedStates.push_back(ExpectedState{1, maxNewTokens, {0, 1, 2},
                    {{0, maxNewTokens, promptLen, 1, promptLen}, {1, maxNewTokens, promptLen, 0},
                        {2, maxNewTokens, promptLen, 0}}});
                expectedStates.push_back(ExpectedState{maxNewTokens, maxNewTokens + 1, {1, 2},
                    {{1, maxNewTokens, promptLen, maxNewTokens - 1, promptLen},
                        {2, maxNewTokens, promptLen, maxNewTokens - 1, promptLen}}});
            }

            // Callback called at each iteration, giving the option to add new active requests (unused here)
            auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

            int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
                addNewRequestsCb, maxInputLen, peftCacheManager);
            if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kSTATIC_BATCH)
            {
                EXPECT_EQ(numIterations, maxNewTokens);
            }
            else
            {
                EXPECT_EQ(numIterations, maxNewTokens + 1);
            }
            EXPECT_EQ(kvCacheManager->getNumReusedBlocks(), promptLen / kvCacheTokensPerBlock * 2);
        }
    }
}
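// Illustrative sketch (hypothetical helper, not used above) of the reuse count checked in
// DelayDuplicateRequest: with block reuse enabled, each duplicate request can reuse the shared
// prompt's full blocks, excluding the last input token. A 21-token prompt with 10 tokens per
// block therefore yields (21 - 1) / 10 = 2 reusable blocks per follower, and two followers
// give the expected 4 reused blocks.
[[maybe_unused]] static SizeType32 reusableBlocksPerFollower(SizeType32 promptLen, SizeType32 tokensPerBlock)
{
    // Integer division drops the partial block holding the non-reused last token.
    return (promptLen - 1) / tokensPerBlock;
}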
TEST_F(CapacitySchedulerTest, DelayDuplicateRequestChunked)
{
    SizeType32 kvCacheMaxNumTokens = 200;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;
    bool enableReuse = true;

    auto capacitySchedulerPolicies = std::vector<CapacitySchedulerPolicy>{
        CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, CapacitySchedulerPolicy::kMAX_UTILIZATION};

    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, 0, enableReuse);
        auto peftCacheManager = getPeftCacheManager();
        auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

        // Create two identical requests that should fit in the kvCache for their entire duration
        int32_t maxNewTokens = 40;
        int32_t promptLen = 50;
        RequestList activeRequests;
        auto inputTokens0 = std::make_shared<VecTokens>(promptLen, 1);
        std::iota(inputTokens0->begin(), inputTokens0->end(), 0);
        auto request0 = createRequest(inputTokens0, maxNewTokens, 0, 1234);
        auto inputTokens1 = std::make_shared<VecTokens>(promptLen, 1);
        std::iota(inputTokens1->begin(), inputTokens1->end(), 0);
        auto request1 = createRequest(inputTokens1, maxNewTokens, 1, 1234);
        request0->setContextChunkSize(20);
        request1->setContextChunkSize(20);
        activeRequests.push_back(std::move(request0));
        activeRequests.push_back(std::move(request1));

        std::vector<ExpectedState> expectedStates;
        // No delay in static batching. (Note: kSTATIC_BATCH is not in the policy list above,
        // so this branch is currently not exercised.)
        if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kSTATIC_BATCH)
        {
            expectedStates.push_back(ExpectedState{0, 41, {0, 1}, {{0, 40, 50, 0, 0}, {1, 40, 50, 0, 0}}});
        }
        else
        {
            expectedStates.push_back(ExpectedState{0, 1, {0, 1}, {{0, 40, 50, 0, 0}}});
            expectedStates.push_back(ExpectedState{1, 2, {0, 1}, {{0, 40, 50, 0, 20}}});
            expectedStates.push_back(ExpectedState{2, 3, {0, 1}, {{0, 40, 50, 0, 40}}});
            expectedStates.push_back(ExpectedState{3, 4, {0, 1}, {{0, 40, 50, 1, 50}, {1, 40, 50, 0, 0}}});
            expectedStates.push_back(ExpectedState{4, 42, {0, 1}, {{0, 40, 50, 2, 50}, {1, 40, 50, 1, 50}}});
            expectedStates.push_back(ExpectedState{42, 43, {1}, {{1, 40, 50, 39, 50}}});
        }

        // Callback called at each iteration, giving the option to add new active requests (unused here)
        auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

        int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        if (capacitySchedulerPolicy == CapacitySchedulerPolicy::kSTATIC_BATCH)
        {
            EXPECT_EQ(numIterations, 41);
            EXPECT_EQ(kvCacheManager->getNumReusedBlocks(), 0);
        }
        else
        {
            EXPECT_EQ(numIterations, 43);
            EXPECT_EQ(kvCacheManager->getNumReusedBlocks(), 4);
        }
    }
}
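// Note on the chunked reuse count above (explanatory, not from the original): request 1 reuses
// request 0's prompt blocks excluding the last input token, i.e. (50 - 1) / 10 = 4 full blocks,
// matching the EXPECT_EQ(kvCacheManager->getNumReusedBlocks(), 4) in the non-static-batch path.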
TEST_F(CapacitySchedulerTest, DelayFiveRequestsComplicated)
{
    SizeType32 kvCacheMaxNumTokens = 1000;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 150;
    SizeType32 maxNumRequests = 5;
    SizeType32 maxInputLen = 1000;
    bool enableReuse = true;

    auto capacitySchedulerPolicies = std::vector<CapacitySchedulerPolicy>{
        CapacitySchedulerPolicy::kMAX_UTILIZATION, CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT};
    auto sinkTokenLens = std::vector<SizeType32>{0}; // sinkTokenLen > 0 is not supported with KV cache reuse

    for (auto capacitySchedulerPolicy : capacitySchedulerPolicies)
    {
        for (auto sinkTokenLen : sinkTokenLens)
        {
            auto kvCacheManager = getKvCacheManager(maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens,
                kvCacheMaxNumTokensPerSeq, sinkTokenLen, enableReuse);
            auto peftCacheManager = getPeftCacheManager();
            auto capacityScheduler
                = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

            // Create five requests that should fit in the kvCache for their entire duration.
            // Prompt lengths must be one greater than a multiple of kvCacheTokensPerBlock
            // because we don't reuse the last input token.
            int32_t maxNewTokens = 80;
            std::vector<int32_t> promptLens(5);
            RequestList activeRequests;
            // produce 2 requests with unique system prompts
            for (int i = 0; i < 2; ++i)
            {
                int32_t promptLen = kvCacheTokensPerBlock * (i + 1) + 1;
                promptLens[i] = promptLen;
                auto inputTokens = std::make_shared<VecTokens>(promptLen, 1);
                std::iota(inputTokens->begin(), inputTokens->end(), i + 1);
                activeRequests.push_back(createRequest(inputTokens, maxNewTokens, i, 1234));
            }
            // produce 3 requests with matching system prompts
            for (int i = 0; i < 3; ++i)
            {
                int32_t promptLen = kvCacheTokensPerBlock * (i + 1) + 1;
                promptLens[2 + i] = promptLen;
                auto inputTokens = std::make_shared<VecTokens>(promptLen, 1);
                std::iota(inputTokens->begin(), inputTokens->end(), 0);
                activeRequests.push_back(createRequest(inputTokens, maxNewTokens, 2 + i, 1234));
            }

            std::vector<ExpectedState> expectedStates;
            expectedStates.push_back(ExpectedState{0, 1, {0, 1, 2, 3, 4},
                {{0, maxNewTokens, promptLens[0], 0, 0}, {1, maxNewTokens, promptLens[1], 0, 0},
                    {2, maxNewTokens, promptLens[2], 0, 0}}});
            expectedStates.push_back(ExpectedState{1, 2, {0, 1, 2, 3, 4},
                {{0, maxNewTokens, promptLens[0], 1, promptLens[0]},
                    {1, maxNewTokens, promptLens[1], 1, promptLens[1]},
                    {2, maxNewTokens, promptLens[2], 1, promptLens[2]}, {3, maxNewTokens, promptLens[3], 0, 0}}});
            expectedStates.push_back(ExpectedState{2, 80, {0, 1, 2, 3, 4},
                {{0, maxNewTokens, promptLens[0], 2, promptLens[0]},
                    {1, maxNewTokens, promptLens[1], 2, promptLens[1]},
                    {2, maxNewTokens, promptLens[2], 2, promptLens[2]},
                    {3, maxNewTokens, promptLens[3], 1, promptLens[3]}, {4, maxNewTokens, promptLens[4], 0, 0}}});
            expectedStates.push_back(ExpectedState{80, 81, {3, 4},
                {{3, maxNewTokens, promptLens[3], 79, promptLens[3]},
                    {4, maxNewTokens, promptLens[4], 78, promptLens[4]}}});
            expectedStates.push_back(
                ExpectedState{81, 82, {4}, {{4, maxNewTokens, promptLens[4], 79, promptLens[4]}}});

            // Callback called at each iteration, giving the option to add new active requests (unused here)
            auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

            int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
                addNewRequestsCb, maxInputLen, peftCacheManager);
            EXPECT_EQ(numIterations, 82);
            EXPECT_EQ(kvCacheManager->getNumReusedBlocks(), 3);
        }
    }
}

TEST_F(CapacitySchedulerTest, SimpleFitsStaticBatch)
{
    SizeType32 kvCacheMaxNumTokens = 200;
    SizeType32 kvCacheTokensPerBlock = 10;
    SizeType32 kvCacheMaxNumTokensPerSeq = 90;
    SizeType32 maxNumRequests = 2;
    SizeType32 maxInputLen = 1000;

    // TODO: Support and add coverage for sinkTokenLen > 0
    auto sinkTokenLens = std::vector<SizeType32>{0};

    for (auto sinkTokenLen : sinkTokenLens)
    {
        auto kvCacheManager = getKvCacheManager(
            maxNumRequests, kvCacheTokensPerBlock, kvCacheMaxNumTokens, kvCacheMaxNumTokensPerSeq, sinkTokenLen);
        auto peftCacheManager = getPeftCacheManager();
        CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kSTATIC_BATCH;
        auto capacityScheduler = CapacityScheduler(maxNumRequests, capacitySchedulerPolicy, kvCacheManager != nullptr);

        // Create three requests that will fit in the kvCache for their entire duration
        int32_t maxNewTokens = 80;
        int32_t promptLen = 10;
        RequestList activeRequests;
        activeRequests.push_back(createRequest(promptLen, maxNewTokens, 0));
        activeRequests.push_back(createRequest(promptLen, maxNewTokens / 2, 1)); // This request finishes earlier.
        activeRequests.push_back(createRequest(promptLen, maxNewTokens, 2));

        std::vector<ExpectedState> expectedStates;
        // The first two requests are scheduled together. Only when both of them finish is the 3rd one scheduled.
        expectedStates.push_back(ExpectedState{0, 40, {0, 1, 2}, {{0, 80, 10, 0}, {1, 40, 10, 0}}});
        expectedStates.push_back(ExpectedState{41, 79, {0, 2}, {{0, 80, 10, 41, 10}}});
        expectedStates.push_back(ExpectedState{80, 160, {2}, {{2, 80, 10, 0}}});

        // Callback called at each iteration, giving the option to add new active requests (unused here)
        auto addNewRequestsCb = [this](RequestList& activeRequests, int itCount) {};

        int numIterations = runTest(capacityScheduler, kvCacheManager, activeRequests, expectedStates,
            addNewRequestsCb, maxInputLen, peftCacheManager);
        EXPECT_EQ(numIterations, 160);
    }
}
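// Static-batch accounting for SimpleFitsStaticBatch (explanatory note, not part of the original
// test): a static batch runs until its longest member completes, so the first batch of requests
// 0 and 1 takes max(80, 40) = 80 iterations, after which request 2 runs alone for another 80,
// giving the expected total of 160 iterations.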