/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#ifndef TOP_LEVEL_DIR
#error "Define TOP_LEVEL_DIR"
#endif

#include "executorTest.h"

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/executor/dataTransceiverState.h"
#include "tensorrt_llm/executor/requestWithId.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/executor/version.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/utils/numpyUtils.h"
#include "tensorrt_llm/testing/modelSpec.h"
#include "tests/utils/common.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <atomic>
#include <chrono>
#include <deque>
#include <filesystem>
#include <map>
#include <memory>
#include <set>
#include <thread>
#include <unordered_map>
#include <vector>

namespace tr = tensorrt_llm::runtime;
namespace tc = tensorrt_llm::common;

using namespace tensorrt_llm::testing;
using namespace tensorrt_llm::executor;
using namespace std::chrono_literals;

namespace fs = std::filesystem;

using tensorrt_llm::testing::KVCacheType;
using tensorrt_llm::testing::ModelSpec;

namespace
{
auto const LORA_DATA_PATH = DATA_PATH / "lora-test-weights-gpt2-tp1";
auto const LORA_WEIGHTS_FILE = LORA_DATA_PATH / "source.npy";
auto const LORA_CONFIG_FILE = LORA_DATA_PATH / "config.npy";
auto constexpr LLAMA_INPUT_FILE = "input_tokens_llama.npy";
auto constexpr LLAMA_VOCAB_SIZE_PADDED = 128256;
auto constexpr LLAMA_PAD_ID = 128001;
auto constexpr LLAMA_END_ID = 128001;
} // namespace

// Construct an executor that is expected to throw and verify the error message contains expectedErrMsg.
void testInvalidCtor(std::filesystem::path const& enginePath, ModelType modelType, ExecutorConfig executorConfig,
    std::string expectedErrMsg = "")
{
    try
    {
        auto executor = Executor(enginePath, modelType, executorConfig);
        FAIL() << "Expected TllmException";
    }
    catch (std::exception const& e)
    {
        EXPECT_THAT(e.what(), testing::HasSubstr(expectedErrMsg));
    }
}

TEST_F(GptExecutorTest, version)
{
    EXPECT_STRNE(kTensorRtLlmVersion, "@TRTLLM_VERSION@");
    EXPECT_STREQ(kTensorRtLlmVersion, version());
}

TEST_F(GptExecutorTest, validCtor)
{
    SizeType32 beamWidth = 1;
    auto executorConfig = ExecutorConfig(beamWidth);
    auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
    auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}

TEST_F(GptExecutorTest, invalidCtor)
{
    SizeType32 beamWidth = 1;
    auto executorConfig = ExecutorConfig(beamWidth);
    auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";

    std::filesystem::path invalidPath{"Bla"};

    // Invalid path
    {
        testInvalidCtor(invalidPath, ModelType::kDECODER_ONLY, executorConfig, "File does not exist");
    }
}

TEST_F(GptExecutorTest, enqueueAfterShutdown)
{
    SizeType32 beamWidth = 1;
    auto executorConfig = ExecutorConfig(beamWidth);
    auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
    auto
executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); SizeType32 maxNewTokens = 5; VecTokens inputTokens{1, 2, 3, 4}; auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth)); auto requestId = executor.enqueueRequest(request); bool done = false; int iter = 0; while (!done && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { FAIL(); } else { done = response.getResult().isFinal; } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); executor.shutdown(); EXPECT_FALSE(executor.canEnqueueRequests()); std::string expErrMsg{"Shutdown called"}; EXPECT_THAT([&]() { auto reqId = executor.enqueueRequest(request); }, testing::Throws( testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg)))); EXPECT_THAT([&]() { auto resp = executor.awaitResponses(); }, testing::Throws( testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg)))); EXPECT_THAT([&]() { auto stats = executor.getLatestIterationStats(); }, testing::Throws( testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg)))); EXPECT_THAT([&]() { auto stats = executor.getLatestRequestStats(); }, testing::Throws( testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg)))); EXPECT_THAT([&]() { executor.cancelRequest(requestId); }, testing::Throws( testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg)))); } TEST_F(GptExecutorTest, missingPeftTask) { SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_LORA_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the request SizeType32 maxNewTokens = 5; VecTokens inputTokens{1, 2, 3, 4}; auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth)); auto loraConfig = LoraConfig{10}; request.setLoraConfig(loraConfig); auto requestId = executor.enqueueRequest(request); bool done = false; std::chrono::milliseconds waitTime(mMaxWaitMs); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { auto err = response.getErrorMsg(); EXPECT_EQ(err, std::string("LoRA task 10 not found in cache. 
Please send LoRA weights with request")); done = true; } else { FAIL() << "Expects error due to missing Lora weights"; } } EXPECT_TRUE(done); } TEST_F(GptExecutorTest, ReturnAcceptedTokenLogits) { SizeType32 constexpr beamWidth{1}; SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded // Create executor config auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setGatherGenerationLogits(true); // Enable kv cache reuse of executorConfig bool enableBlockReuse = true; FloatType freeGpuMemoryFraction = 0.4; auto kvCacheConfig = KvCacheConfig(enableBlockReuse, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction); executorConfig.setKvCacheConfig(kvCacheConfig); // Create executor auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DRAFT_TOKENS_DIR() / "tp1-pp1-cp1-gpu"); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create request SizeType32 maxNewTokens = 5; VecTokens inputTokens{1, 2, 3, 4, 5, 6, 7, 8}; std::vector streamingOptions{false, true}; for (auto streaming : streamingOptions) { auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth)); // Set draft tokens auto draftTokens = VecTokens{9, 10, 11, 12, 13}; // draft tokens auto draftLength = draftTokens.size(); FloatType const acceptanceThreshold = 0.00001f; // Ensure the draft token can be accepted auto externalDraftTokensConfig = ExternalDraftTokensConfig(draftTokens, std::nullopt, acceptanceThreshold); request.setExternalDraftTokensConfig(externalDraftTokensConfig); // Set return accepted token logits for this request OutputConfig outConfig; outConfig.returnGenerationLogits = true; request.setOutputConfig(outConfig); // Enqueue this request auto requestId = executor.enqueueRequest(request); bool done = false; int iter = 0; while (!done && iter < 5000) { std::chrono::milliseconds waitTime(mMaxWaitMs); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { FAIL(); } else { auto result = response.getResult(); done = result.isFinal; auto& genLogits = result.generationLogits; EXPECT_TRUE(genLogits.has_value()); // Expected shape: (1, numAcceptedDraftToken, vocabSizePadded) auto const& acceptedTokenLogitsShape = genLogits->getShape(); EXPECT_EQ(acceptedTokenLogitsShape.size(), 3); EXPECT_EQ(acceptedTokenLogitsShape[0], 1); EXPECT_LE(acceptedTokenLogitsShape[1], draftLength); // number of accepted tokens EXPECT_EQ(acceptedTokenLogitsShape[2], vocabSizePadded); // vocabSizePadded } } ++iter; } } } TEST_F(GptExecutorTest, GenerationLogitsEarlyStop) { SizeType32 constexpr beamWidth{1}; SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded auto constexpr streaming = false; ExtendedRuntimePerfKnobConfig perfKnobConfig = ExtendedRuntimePerfKnobConfig(); // Create executor config auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setExtendedRuntimePerfKnobConfig(perfKnobConfig); executorConfig.setGatherGenerationLogits(true); // Create executor auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu"); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); auto const inputPath = DATA_PATH / "input_tokens.npy"; ModelIds modelIds{50256, 50256}; auto manager = tr::BufferManager(std::make_shared()); auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU); auto 
[givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId); auto const* const givenInputData = tr::bufferCast(*givenInput); auto const& inputShape = givenInput->getShape(); ASSERT_EQ(inputShape.nbDims, 2); ASSERT_GT(inputShape.d[0], 0); BeamResult beamResult{beamWidth}; auto const resultsPath = GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth)); beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE(); beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE(); beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE(); // Set return generation logits for this request OutputConfig outConfig; outConfig.returnGenerationLogits = true; outConfig.excludeInputFromOutput = true; // Load expected outputs for each beam width value auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds); auto const maxSeqLen = testData.maxSeqLen; // Load expected outputs and inputs std::vector requests; std::vector reqMaxNewTokens; auto constexpr reqIdx = 0; SizeType32 inputLen = givenInputLengths.at(reqIdx); auto maxNewTokens = maxSeqLen - maxInputLength; reqMaxNewTokens.push_back(maxNewTokens); auto const* const seqBegin = givenInputData + reqIdx * maxInputLength; auto request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId); // copy request auto request2 = request; auto const expectedOutputData = tr::BufferRange(*testData.expectedOutputIds); auto const expectedOutputLengths = testData.expectedOutputLengths; auto const endPos = expectedOutputLengths[reqIdx] - 3; auto const endIndex = tc::flat_index3(reqIdx, beamWidth - 1, endPos, beamWidth, maxSeqLen); auto const endToken = expectedOutputData[endIndex]; // Set end id to stop early request.setEndId(endToken); requests.emplace_back(std::move(request)); // Set stop words to stop early request2.setStopWords({{endToken}}); requests.emplace_back(std::move(request2)); // Enqueue requests auto requestIds = executor.enqueueRequests(requests); std::map expectedNewTokens; expectedNewTokens[requestIds.at(0)] = endPos - inputLen; expectedNewTokens[requestIds.at(1)] = endPos - inputLen + 1; std::map expectedFinishReason; expectedFinishReason[requestIds.at(0)] = FinishReason::kEND_ID; expectedFinishReason[requestIds.at(1)] = FinishReason::kSTOP_WORDS; std::map done; std::for_each(requestIds.begin(), requestIds.end(), [&done](auto id) { done[id] = false; }); int iter = 0; while (!(std::all_of(done.begin(), done.end(), [](auto x) { return x.second; })) && iter < 5000) { std::chrono::milliseconds waitTime(mMaxWaitMs); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { if (response.hasError()) { FAIL(); } else { auto const reqId = response.getRequestId(); auto const& result = response.getResult(); EXPECT_TRUE(result.isFinal); done.at(reqId) = result.isFinal; // only 1 beam auto const& outputIds = result.outputTokenIds.at(0); EXPECT_EQ(outputIds.size(), expectedNewTokens.at(reqId)) << "req " << reqId; auto const& finishReason = result.finishReasons.at(0); EXPECT_EQ(finishReason, expectedFinishReason.at(reqId)) << "req " << reqId; auto const& genLogits = result.generationLogits; EXPECT_TRUE(genLogits.has_value()); // Expected shape: (1, numAcceptedDraftToken, vocabSizePadded) auto const& 
generationLogitsShape = genLogits->getShape(); EXPECT_EQ(generationLogitsShape.size(), 3); EXPECT_EQ(generationLogitsShape[0], 1); EXPECT_LE(generationLogitsShape[1], maxNewTokens); EXPECT_EQ(generationLogitsShape[2], vocabSizePadded); auto const genLogitsTensor = detail::toITensor(*genLogits); genLogitsTensor->squeeze(0); // only 1 beam for (size_t outputIdx = 0; outputIdx < expectedNewTokens.at(reqId); ++outputIdx) { // logits argmax should be equal to tokenId auto const genLogitsSlice = tr::ITensor::slice(genLogitsTensor, outputIdx, 1); auto const genLogitsRange = tr::BufferRange(*genLogitsSlice); auto const* maxPos = std::max_element(genLogitsRange.begin(), genLogitsRange.end()); auto const maxIdx = std::distance(genLogitsRange.begin(), maxPos); auto const tokenId = outputIds.at(outputIdx); // Observed token mismatch at index 2 after building GPT engine with TRT builder optimization // level 3. The testcase is sensitive to slight variation in kernel computation, so we skip checking // for token id at index 2. if (outputIdx != 2) { EXPECT_EQ(tokenId, maxIdx) << "req " << reqId << " outputIdx " << outputIdx; } } } } ++iter; } } TEST_F(GptExecutorTest, GenerationChangeEndId) { SizeType32 constexpr beamWidth{2}; SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded auto constexpr streaming = false; ExtendedRuntimePerfKnobConfig perfKnobConfig = ExtendedRuntimePerfKnobConfig(); perfKnobConfig.setEnableContextFMHAFP32Acc(true); // use fmha fp32 acc for better accuracy // Create executor config auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setExtendedRuntimePerfKnobConfig(perfKnobConfig); // Create executor auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu"); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); auto const inputPath = DATA_PATH / "input_tokens.npy"; ModelIds modelIds{50256, 50256}; auto manager = tr::BufferManager(std::make_shared()); auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU); auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId); auto const* const givenInputData = tr::bufferCast(*givenInput); auto const& inputShape = givenInput->getShape(); ASSERT_EQ(inputShape.nbDims, 2); ASSERT_GT(inputShape.d[0], 0); BeamResult beamResult{beamWidth}; auto const resultsPath = GPT_DATA_PATH / ((beamWidth == 1) ? 
"sampling" : "beam_search_" + std::to_string(beamWidth)); beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_CONTEXTFMHAFP32ACC_RESULT_FILE(); // Just return tokens for check OutputConfig outConfig; outConfig.excludeInputFromOutput = true; // Load expected outputs for each beam width value auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds); auto const maxSeqLen = testData.maxSeqLen; // Load expected outputs and inputs std::vector requests; std::vector reqMaxNewTokens; // Only use the first request to test auto constexpr reqIdx = 0; SizeType32 inputLen = givenInputLengths.at(reqIdx); auto maxNewTokens = maxSeqLen - maxInputLength; reqMaxNewTokens.push_back(maxNewTokens); auto const* const seqBegin = givenInputData + reqIdx * maxInputLength; // Use customized `EndId` to enqueue once auto request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId); TokenIdType customizedEndId = *(seqBegin + 1); // Use a token appeared in ground-truth request.setEndId(customizedEndId); requests.emplace_back(std::move(request)); auto requestIds = executor.enqueueRequests(requests); std::chrono::milliseconds waitTime(mMaxWaitMs); auto responses = executor.awaitResponses(waitTime); if (responses.at(0).hasError()) { FAIL(); } requests.clear(); // Change back to default `EndId` to enqueue again, and check the output request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId); auto const expectedOutputData = tr::BufferRange(*testData.expectedOutputIds); auto const expectedOutputLengths = testData.expectedOutputLengths; auto const endPos = expectedOutputLengths[reqIdx]; auto const endIndex = tc::flat_index3(reqIdx, beamWidth, endPos, beamWidth, maxSeqLen); auto const endToken = expectedOutputData[endIndex]; request.setEndId(endToken); requests.emplace_back(std::move(request)); requestIds = executor.enqueueRequests(requests); auto const requestId = requestIds.at(0); std::map expectedNewTokens; expectedNewTokens[requestId] = endPos - inputLen; std::map expectedFinishReason; expectedFinishReason[requestId] = FinishReason::kLENGTH; std::map done; std::for_each(requestIds.begin(), requestIds.end(), [&done](auto id) { done[id] = false; }); int iter = 0; while (!(std::all_of(done.begin(), done.end(), [](auto x) { return x.second; })) && iter < 5000) { std::chrono::milliseconds waitTime(mMaxWaitMs); auto responses = executor.awaitResponses(waitTime); auto& response = responses.at(0); if (response.hasError()) { FAIL(); } else { auto const reqId = response.getRequestId(); auto const& result = response.getResult(); EXPECT_TRUE(result.isFinal); done.at(reqId) = result.isFinal; bool anyMismatch = false; for (int i = 0; i < result.outputTokenIds.size(); ++i) { auto const& outputIds = result.outputTokenIds.at(i); EXPECT_EQ(outputIds.size(), expectedNewTokens.at(reqId)) << "req " << reqId; anyMismatch |= outputIds.size() != expectedNewTokens.at(reqId); auto const& finishReason = result.finishReasons.at(i); EXPECT_EQ(finishReason, expectedFinishReason.at(reqId)) << "req " << reqId; anyMismatch |= finishReason != expectedFinishReason.at(reqId); if (anyMismatch) { break; } for (int j = 0; j < outputIds.size(); ++j) { auto const resultToken = outputIds[j]; auto const groundTruthToken = expectedOutputData[maxSeqLen * i + inputLen + j]; 
EXPECT_EQ(resultToken, groundTruthToken); anyMismatch |= resultToken != groundTruthToken; } } EXPECT_FALSE(anyMismatch); } ++iter; } } // stream, excludeInputFromOutput, beamWidth using ParamType = std::tuple; // useOrchestratorMode, beamWidth, modelName using ParamCancelReqType = std::tuple; // modelName using LeaderApiUsageType = std::tuple; // iterStatsMaxIterations, useOrchestratorMode using ParamStatsType = std::tuple; // streaming, beamWidth, computeLogProbs, excludeInputInOutput, returnContextLogits, returnGenerationLogits, modelName, // useOrchestratorMode, returnAllGeneratedTokens, numReturnSequences using AllParamsType = std::tuple; // modelName, batched, replicated using LogitsProcParamsType = std::tuple; // modelName using GuidedDecodingParamsType = std::tuple; // modelName, useOrchestratorMode, beamWidth using TimeoutTestParamsType = std::tuple; std::string generateTestName(testing::TestParamInfo const& info) { auto const streaming = std::get<0>(info.param); auto const excludeInputFromOutput = std::get<1>(info.param); auto const beamWidth = std::get<2>(info.param); std::string name = "ExecutorTest"; if (streaming) { name += "Streaming"; } if (excludeInputFromOutput) { name += "ExclInput"; } name.append("BW" + std::to_string(beamWidth)); return name; } std::string generateTestNameCancelReq(testing::TestParamInfo const& info) { auto const& useOrchestratorMode = std::get<0>(info.param); auto const beamWidth = std::get<1>(info.param); auto const modelName = std::get<2>(info.param); std::string name = "ExecutorTest"; name.append("BW" + std::to_string(beamWidth)); name.append("_" + modelName + "_"); if (useOrchestratorMode) { name.append("OrchMode"); } else { name.append("LeaderMode"); } return name; } std::string generateTestNameLeaderApiUsage(testing::TestParamInfo const& info) { auto const modelName = std::get<0>(info.param); std::string name = "ExecutorTest"; name.append("_" + modelName); return name; } std::string generateTestNameLogitsProc(testing::TestParamInfo const& info) { auto const modelName = std::get<0>(info.param); bool const batched = std::get<1>(info.param); bool const replicated = std::get<2>(info.param); std::string name = "ExecutorTest"; name.append("_" + modelName); if (batched) { name.append("_Batched"); } if (replicated) { name.append("_Replicated"); } return name; } std::string generateTestNameGuidedDecoding(testing::TestParamInfo const& info) { auto const modelName = std::get<0>(info.param); std::string name = "ExecutorTest"; name.append("_" + modelName); return name; } std::string generateTestNameTimeoutTest(testing::TestParamInfo const& info) { auto const modelName = std::get<0>(info.param); auto const& useOrchestratorMode = std::get<1>(info.param); auto const beamWidth = std::get<2>(info.param); std::string name = "ExecutorTest"; name.append("_" + modelName); if (useOrchestratorMode) { name.append("_OrchMode"); } else { name.append("_LeaderMode"); } name.append("_BW" + std::to_string(beamWidth)); return name; } std::string generateTestNameStats(testing::TestParamInfo const& info) { int iterStatsMaxIterations = std::get<0>(info.param); auto const& useOrchestratorMode = std::get<1>(info.param); std::string name = "ExecutorTest_"; name.append(std::to_string(iterStatsMaxIterations) + "_"); if (useOrchestratorMode) { name.append("OrchMode"); } else { name.append("LeaderMode"); } return name; } std::string generateTestNameAllParams(testing::TestParamInfo const& info) { auto const streaming = std::get<0>(info.param); auto const& beamWidth = 
std::get<1>(info.param); auto const& computeLogProbs = std::get<2>(info.param); auto const& excludeInputInOutput = std::get<3>(info.param); auto const& returnContextLogits = std::get<4>(info.param); auto const& returnGenerationLogits = std::get<5>(info.param); auto const modelName = std::get<6>(info.param); auto const& useOrchestratorMode = std::get<7>(info.param); auto const& returnAllGeneratedTokens = std::get<8>(info.param); auto const& numReturnSequences = std::get<9>(info.param); std::string name = "ExecutorTest_"; if (streaming) { name += "Streaming"; } name.append("_BW" + std::to_string(beamWidth)); name.append("Nseq" + std::to_string(numReturnSequences)); if (computeLogProbs) { name.append("LogProbs"); } if (excludeInputInOutput) { name.append("ExcludeInput"); } if (returnContextLogits) { name.append("ContextLogits"); } if (returnGenerationLogits) { name.append("GenerationLogits"); } name.append("_" + modelName + "_"); if (useOrchestratorMode) { name.append("OrchMode"); } else { name.append("LeaderMode"); } if (returnAllGeneratedTokens) { name.append("returnAllGeneratedTokens"); } return name; } class ParamTest : public GptExecutorTest, public ::testing::WithParamInterface { }; class ParamStatsTest : public GptExecutorTest, public ::testing::WithParamInterface { }; class AllParamsTest : public GptExecutorTest, public ::testing::WithParamInterface { }; class ParamCancelReqTest : public GptExecutorTest, public ::testing::WithParamInterface { }; class LeaderApiUsageTest : public GptExecutorTest, public ::testing::WithParamInterface { }; class LogitsProcParamsTest : public GptExecutorTest, public ::testing::WithParamInterface { }; class GuidedDecodingParamsTest : public GptExecutorTest, public ::testing::WithParamInterface { }; class TimeoutTest : public GptExecutorTest, public ::testing::WithParamInterface { }; TEST_F(GptExecutorTest, GetLatestStats) { bool streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the request SizeType32 maxNewTokens = 5; VecTokens inputTokens{1, 2, 3, 4}; auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig); auto requestId = executor.enqueueRequest(std::move(request)); bool done = false; int iter = 0; while (!done && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { FAIL(); } else { done = response.getResult().isFinal; } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); // Expect 6 non-empty iterations auto stats = executor.getLatestIterationStats(); EXPECT_EQ(stats.size(), 6); uint64_t currentIter = 0; for (auto const& stat : stats) { EXPECT_EQ(stat.timestamp.size(), 26); EXPECT_EQ(stat.iter, currentIter); if (currentIter != 5) { EXPECT_EQ(stat.numActiveRequests, 1); } else { // For the last iteration the number of active requests // should be zero. 
EXPECT_EQ(stat.numActiveRequests, 0); } EXPECT_EQ(stat.maxNumActiveRequests, 64); // Very loose check to make sure the memory stats are valid EXPECT_GT(stat.gpuMemUsage, 16); EXPECT_GT(stat.cpuMemUsage, 16); EXPECT_GT(stat.pinnedMemUsage, 16); // Stats for KV cache EXPECT_TRUE(stat.kvCacheStats.has_value()); KvCacheStats const& kvStats = stat.kvCacheStats.value(); EXPECT_GT(kvStats.maxNumBlocks, 0); EXPECT_GT(kvStats.freeNumBlocks, 0); EXPECT_EQ(kvStats.usedNumBlocks, currentIter == maxNewTokens ? 0 : 1); EXPECT_GT(kvStats.tokensPerBlock, 0); EXPECT_GT(kvStats.allocTotalBlocks, 0); EXPECT_GT(kvStats.allocNewBlocks, 0); EXPECT_GE(kvStats.reusedBlocks, 0); EXPECT_GE(kvStats.missedBlocks, 0); EXPECT_GE(kvStats.cacheHitRate, 0); // Stats for inflight batching EXPECT_TRUE(stat.inflightBatchingStats.has_value() && !stat.staticBatchingStats.has_value()); InflightBatchingStats const& modelStats = stat.inflightBatchingStats.value(); EXPECT_EQ(modelStats.numScheduledRequests, currentIter == maxNewTokens ? 0 : 1); EXPECT_EQ(modelStats.numContextRequests, currentIter == 0 ? 1 : 0); EXPECT_EQ(modelStats.numGenRequests, currentIter == 0 || currentIter == maxNewTokens ? 0 : 1); EXPECT_EQ(modelStats.numPausedRequests, 0); EXPECT_EQ(modelStats.numCtxTokens, currentIter == 0 ? inputTokens.size() : 0); EXPECT_EQ(modelStats.microBatchId, 0); EXPECT_NEAR( modelStats.avgNumDecodedTokensPerIter, currentIter == 0 || currentIter == maxNewTokens ? 0.f : 1.f, 1e-9f); auto jsonStr = JsonSerialization::toJsonStr(stat); EXPECT_THAT(jsonStr, testing::HasSubstr("\"iter\":" + std::to_string(currentIter))); EXPECT_THAT(jsonStr, testing::HasSubstr("\"staticBatchingStats\":null")); EXPECT_THAT(jsonStr, testing::HasSubstr("\"numCtxTokens\":" + std::to_string(modelStats.numCtxTokens))); EXPECT_THAT(jsonStr, testing::HasSubstr("\"numGenRequests\":" + std::to_string(modelStats.numGenRequests))); ++currentIter; } } TEST_F(GptExecutorTest, GetLatestStatsWithMultipleRequests) { bool streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the requests SizeType32 const numRequests = 2; std::vector maxNewTokens{3, 5}; std::vector inputTokens{{1, 2, 3, 4}, {5, 6, 7}}; std::vector reqIds; for (SizeType32 ireq = 0; ireq < numRequests; ++ireq) { auto request = Request(inputTokens[ireq], maxNewTokens[ireq], streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig); auto requestId = executor.enqueueRequest(std::move(request)); reqIds.emplace_back(requestId); // sleep for 10 ms before sending the next request std::this_thread::sleep_for(std::chrono::milliseconds(20)); } for (SizeType32 ireq = 0; ireq < numRequests; ++ireq) { auto requestId = reqIds[ireq]; bool done = false; int iter = 0; while (!done && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { FAIL(); } else { done = response.getResult().isFinal; } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); } // NOTES: // Expect at least max(maxNewTokens) i.e. 5 non-empty iterations // 4th iteration should have numCompletedRequests to be 1. 
// Depending on the timing, first iteration will either have: // 2 active requests // or // 1 active requests and 1 queued requests auto stats = executor.getLatestIterationStats(); EXPECT_GT(stats.size(), 0); // make sure we have at least 1 stat before the accessing 0-th element if (stats[0].numActiveRequests == 2) { // we cannot reliably check queue latency since both started in the same iteration // there should be exactly 5 non-empty iterations EXPECT_EQ(stats.size(), 5); // only check numCompletedRequests in 4th iteration EXPECT_EQ(stats[3].numCompletedRequests, 1); // 1st iteration shall record all 2 requests queueing time; EXPECT_EQ(stats[0].numNewActiveRequests, 2); // all rest iterations shall not return any queueing time; for (int i = 1; i < stats.size(); ++i) { EXPECT_EQ(stats[i].numNewActiveRequests, 0); } } else { // there should be more than 5 non-empty iterations since 2nd request started after 1st iteration EXPECT_GT(stats.size(), 5); // 1st request's completion is at 4th iteration EXPECT_EQ(stats[3].numCompletedRequests, 1); // 1st iteration record 1 request's queueing time; EXPECT_EQ(stats[0].numNewActiveRequests, 1); // the iteration where 2nd request became active, queue latency must be > 0 uint64_t currentIter = 0; for (auto const& stat : stats) { // To check when 2nd request becomes active, we need to think about 2 cases: // - it overlaps with first request // => only check queue time in this case // - it doesn't overlap with the first request (e.g. 1st request ended too fast) // => little to no queue time, cannot check reliably // so we only check for queue time when numActiveRequests > 1 i.e. overlap happened after first iteration if (stat.numActiveRequests > 1) { EXPECT_GT(currentIter, 0); // it must be after 1st iteration EXPECT_GT(stat.newActiveRequestsQueueLatencyMS, 0); // 2nd request record queueing time in this iteration EXPECT_EQ(stat.numNewActiveRequests, 1); break; } ++currentIter; } } } TEST_F(GptExecutorTest, GetLatestRequestStats) { bool streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setRequestStatsMaxIterations(1000); executorConfig.setEnableChunkedContext(true); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the requests std::vector> requestParams = { // {maxNewTokens, inputTokens} {5, {1, 2, 3, 4}}, {4, {1, 1, 2, 3, 5}}, {1, {1}}, {8, VecTokens(383, 1)} // Long enough to be chunked into multiple iterations }; std::vector requests; for (auto requestParam : requestParams) { requests.emplace_back(requestParam.second, requestParam.first, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig); } auto requestIdsVec = executor.enqueueRequests(std::move(requests)); std::map requestIdToIndex; std::set activeRequests; for (SizeType32 i = 0; i < requestIdsVec.size(); ++i) { auto requestId = requestIdsVec[i]; activeRequests.insert(requestId); requestIdToIndex[requestId] = i; } int iter = 0; while (!activeRequests.empty() && iter < mMaxWaitMs) { for (auto i = activeRequests.begin(); i != activeRequests.end();) { auto requestId = *i; bool thisDone = false; std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { 
// Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } else { thisDone = response.getResult().isFinal; } } if (thisDone) { // Erase completed request and move to the next one i = activeRequests.erase(i); } else { ++i; } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); // Expect 5 non-empty iterations // Note: The 6th iteration with the last finished request will be reported // but might be unavailable when getLatestRequestStats is called since // it could be updated after the final response has been sent. auto stats = executor.getLatestRequestStats(); EXPECT_GE(stats.size(), 5); SizeType32 currentIter = 0; auto invalidStart = std::numeric_limits::max(); std::vector genStart(requestParams.size(), invalidStart); // The iteration index when generation started std::set completedRequests; for (auto stat = stats.begin(); stat != stats.begin() + 5; ++stat) { auto jsonStrIter = JsonSerialization::toJsonStr(*stat); EXPECT_EQ(stat->iter, currentIter); EXPECT_THAT(jsonStrIter, testing::HasSubstr("\"iter\":" + std::to_string(currentIter))); EXPECT_EQ(stat->requestStats.size() + completedRequests.size(), requestParams.size()); for (auto rStat : stat->requestStats) { auto jsonStr = JsonSerialization::toJsonStr(rStat); // Only a few requests here so all of them should be scheduled. A separate test // GetLatestRequestStatsScheduling will target the scheduling stats. if (rStat.stage != RequestStage::kGENERATION_COMPLETE) { EXPECT_TRUE(rStat.scheduled); EXPECT_THAT(jsonStr, testing::HasSubstr("\"scheduled\":true")); } EXPECT_TRUE(!rStat.paused); EXPECT_THAT(jsonStr, testing::HasSubstr("\"paused\":false")); EXPECT_TRUE(requestIdToIndex.count(rStat.id)); EXPECT_THAT(jsonStr, testing::HasSubstr("\"id\":" + std::to_string(rStat.id))); auto requestIndex = requestIdToIndex[rStat.id]; auto contextSize = requestParams[requestIndex].second.size(); if (rStat.contextPrefillPosition == contextSize) // Check generation phase { bool firstIteration{false}; // Context phase is done EXPECT_TRUE(rStat.stage == RequestStage::kGENERATION_IN_PROGRESS || rStat.stage == RequestStage::kGENERATION_COMPLETE); EXPECT_THAT(jsonStr, testing::HasSubstr("\"stage\":\"GENERATION")); if (genStart[requestIndex] == invalidStart) { // Just started generation genStart[requestIndex] = currentIter; firstIteration = true; } // One token per iteration EXPECT_TRUE(currentIter - genStart[requestIndex] == rStat.numGeneratedTokens); EXPECT_NEAR(rStat.avgNumDecodedTokensPerIter, firstIteration ? 
0.f : 1.0f, 1e-9); if (rStat.stage == RequestStage::kGENERATION_COMPLETE) { EXPECT_TRUE(requestParams[requestIndex].first >= rStat.numGeneratedTokens); completedRequests.insert(requestIndex); } else { EXPECT_FALSE(completedRequests.count(requestIndex)); } } else if (rStat.contextPrefillPosition < contextSize) // Check context phase { // Must be chunked SizeType32 const maxChunkSize = 128; EXPECT_TRUE(rStat.contextPrefillPosition % maxChunkSize == 0); // Context phase is on-going EXPECT_TRUE(rStat.stage == RequestStage::kCONTEXT_IN_PROGRESS); // No tokens are generated EXPECT_TRUE(0 == rStat.numGeneratedTokens); } else { FAIL() << "Out-of-boundary contextPrefillPosition in stats: " << rStat.contextPrefillPosition << " out of " << contextSize; } // Sanity check that disaggregated serving stats is not set in typical use case EXPECT_FALSE(rStat.disServingStats.has_value()); } ++currentIter; } // We should have visited all requests. // Take into consideration the last request has not been reported EXPECT_EQ(completedRequests.size() + 1, requestParams.size()); } TEST_F(GptExecutorTest, GetLatestRequestStatsScheduling) { // Specifically test the case where there are too many requests to be scheduled for a iteration bool streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setRequestStatsMaxIterations(1000); executorConfig.setEnableChunkedContext(true); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create 100 requests. Note the max batch size for this model is 64 so some requests won't be scheduled right away. std::vector> requestParams(100, {5, {1, 2, 3, 4}}); std::vector requests; requests.reserve(requestParams.size()); for (auto requestParam : requestParams) { requests.emplace_back(requestParam.second, requestParam.first, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig); } auto requestIdsVec = executor.enqueueRequests(std::move(requests)); std::map requestIdToIndex; std::set activeRequests; for (SizeType32 i = 0; i < requestIdsVec.size(); ++i) { auto requestId = requestIdsVec[i]; activeRequests.insert(requestId); requestIdToIndex[requestId] = i; } int iter = 0; while (!activeRequests.empty() && iter < mMaxWaitMs) { for (auto i = activeRequests.begin(); i != activeRequests.end();) { auto requestId = *i; bool thisDone = false; std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { // Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } else { thisDone = response.getResult().isFinal; } } if (thisDone) { // Erase completed request and move to the next one i = activeRequests.erase(i); } else { ++i; } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); auto stats = executor.getLatestRequestStats(); SizeType32 numFinished = 0; SizeType32 const maxActiveSize = 64; // Decided by the model // The 6th iteration request stat may or may not be available when getLatestRequestStats // is called. 
When there are no other active or inTransmission requests, there will be // another request stats to properly reset all the statistics to zero. for (auto stat = stats.begin(); stat != stats.begin() + 5; ++stat) { SizeType32 numReqs = 0; SizeType32 numReqsActive = 0; SizeType32 numReqsQueued = 0; SizeType32 numReqsJustDone = 0; for (auto rStat : stat->requestStats) { ++numReqs; numReqsActive += rStat.scheduled ? 1 : 0; numReqsQueued += rStat.stage == RequestStage::kQUEUED ? 1 : 0; numReqsJustDone += rStat.stage == RequestStage::kGENERATION_COMPLETE ? 1 : 0; } EXPECT_EQ(numReqs, numReqsActive + numReqsQueued + numReqsJustDone); EXPECT_EQ(numReqs + numFinished, requestParams.size()); // Should report all unfinished requests EXPECT_TRUE(numReqsActive <= maxActiveSize); // Not all requests are active due to max active size limit. numFinished += numReqsJustDone; } } TEST_F(GptExecutorTest, GetRequestStatsMultipleRequests) { bool streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setRequestStatsMaxIterations(1000); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); auto sendRequestWaitForResponseFn = [&]() { Request request({1, 2, 3}, 5); auto requestId = executor.enqueueRequest(request); bool isFinalResponse = false; while (!isFinalResponse) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto response : responses) { if (response.getResult().isFinal) { isFinalResponse = true; break; } } } return requestId; }; std::unordered_map requestIdToGenerationComplete; auto updateStats = [&]() { auto stats = executor.getLatestRequestStats(); for (auto& stat : stats) { for (auto const& request : stat.requestStats) { // only check and aggregate results when request is completed if (request.stage == RequestStage::kGENERATION_COMPLETE) { requestIdToGenerationComplete[request.id] += 1; } } } }; auto requestId = sendRequestWaitForResponseFn(); requestIdToGenerationComplete[requestId] = 0; updateStats(); requestId = sendRequestWaitForResponseFn(); requestIdToGenerationComplete[requestId] = 0; updateStats(); for (auto [key, value] : requestIdToGenerationComplete) { EXPECT_EQ(value, 1); } } TEST_F(GptExecutorTest, BatchSizeTuning) { bool streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setRequestStatsMaxIterations(1000); executorConfig.setEnableChunkedContext(true); DynamicBatchConfig dynamicBatchConfig(true, false, 1); // Set window size to 1 SchedulerConfig schedulerConfig(CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::nullopt, dynamicBatchConfig); executorConfig.setSchedulerConfig(schedulerConfig); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); std::vector tunerRecommendedBatchSizes; for (size_t i = 0; i <= 8; ++i) { auto inputLength = 1 << i; // Note that for this model max input len is 383 Request request( VecTokens(inputLength, 2), 5, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig); auto 
requestId = executor.enqueueRequest(std::move(request)); // Wait for current request to finish while (true) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); bool done = false; if (responses.size() != 0) { EXPECT_TRUE(responses.size() == 1); auto response = responses[0]; EXPECT_FALSE(response.hasError()); if (response.getResult().isFinal) { break; } } } auto reqStats = executor.getLatestIterationStats(); EXPECT_TRUE(reqStats.size() > 0); auto lastStat = reqStats.back(); tunerRecommendedBatchSizes.push_back(lastStat.maxBatchSizeTunerRecommended); } EXPECT_TRUE(tunerRecommendedBatchSizes.size() > 0); // It's supposed to be decreasing when input length increases EXPECT_TRUE(*tunerRecommendedBatchSizes.begin() > *tunerRecommendedBatchSizes.rbegin()); } TEST_F(GptExecutorTest, GetLatestDebugTensors) { bool streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 maxNewTokens = 5; tensorrt_llm::executor::DebugConfig debugConfig; debugConfig.setDebugTensorNames({{"sequence_length"}}); debugConfig.setDebugTensorsMaxIterations(maxNewTokens); SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setDebugConfig(debugConfig); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the request VecTokens inputTokens{1, 2, 3, 4}; auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig); auto requestId = executor.enqueueRequest(request); bool done = false; int iter = 0; while (!done && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { FAIL(); } else { done = response.getResult().isFinal; } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); auto stream = std::make_shared(); // Expect 5 non-empty iterations auto debugTensors = executor.getLatestDebugTensors(); EXPECT_EQ(debugTensors.size(), 5); uint64_t currentIter = 0; for (auto const& debugIteration : debugTensors) { EXPECT_EQ(debugIteration.iter, currentIter); EXPECT_EQ(debugIteration.debugTensors.size(), 2); { auto it = debugIteration.debugTensors.find("request_ids"); EXPECT_NE(it, debugIteration.debugTensors.end()); auto const& tensor = it->second; auto const& shape = tensor.getShape(); EXPECT_EQ(shape.size(), 1); EXPECT_EQ(shape[0], 1); EXPECT_EQ(tensor.getSize(), 1); auto const* dataPtr = static_cast(tensor.getData()); EXPECT_EQ(dataPtr[0], 1) << "currentIter " << currentIter; } { auto it = debugIteration.debugTensors.find("sequence_length"); EXPECT_NE(it, debugIteration.debugTensors.end()); auto const& tensor = it->second; auto const& shape = tensor.getShape(); EXPECT_EQ(shape.size(), 1); EXPECT_EQ(tensor.getSize(), 1); auto tensorHost = tensor.copyToCpu(stream); auto const* dataPtr = static_cast(tensorHost.getData()); EXPECT_EQ(dataPtr[0], inputTokens.size() + currentIter); } ++currentIter; } } TEST_P(ParamTest, SingleRequestDemo) { bool const streaming = std::get<0>(GetParam()); bool const excludeInputFromOutput = std::get<1>(GetParam()); auto const beamWidth = std::get<2>(GetParam()); OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = 
GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the request SizeType32 maxNewTokens = 5; VecTokens inputTokens{1, 2, 3, 4}; auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig); // Enqueue the request auto requestId = executor.enqueueRequest(request); // Get the new tokens VecTokens tokens; SizeType32 numResponses{0}; bool done = false; int iter = 0; std::chrono::milliseconds waitTime(1); while (!done && iter < mMaxWaitMs) { auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { ++numResponses; if (response.hasError()) { // This request failed for some reason, get error msg std::string errStr = "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg(); FAIL(); } auto result = response.getResult(); done = result.isFinal; auto& newTokens = result.outputTokenIds.at(beamWidth - 1); auto const expectedSize = streaming ? (beamWidth > 1 ? numResponses : 1) : (maxNewTokens + (excludeInputFromOutput ? 0 : inputTokens.size())); EXPECT_EQ(newTokens.size(), expectedSize); if (streaming && beamWidth > 1) { // replace tokens tokens = newTokens; } else { // Append tokens tokens.insert(tokens.end(), newTokens.begin(), newTokens.end()); } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); EXPECT_EQ(numResponses, streaming ? maxNewTokens : 1); EXPECT_EQ( tokens.size(), streaming ? maxNewTokens : (excludeInputFromOutput ? 0 : inputTokens.size()) + maxNewTokens); // Expect awaitResponse to return error message because the request is already terminated (isFinal = True) auto response = executor.awaitResponses(requestId, waitTime).at(0); EXPECT_TRUE(response.hasError()); std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } TEST_P(ParamTest, MultipleRequestDemo) { bool const streaming = std::get<0>(GetParam()); bool const excludeInputFromOutput = std::get<1>(GetParam()); auto const beamWidth = std::get<2>(GetParam()); OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 numRequests = 20; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); SizeType32 maxPromptLen = 20; SizeType32 maxMaxNewTokens = 20; SizeType32 endId = -1; // Enqueue the requests std::unordered_map tokens; std::unordered_map expectedNumTokens; std::unordered_map expectedNumResponses; for (SizeType32 req = 0; req < numRequests; ++req) { SizeType32 promptLen = rand() % maxPromptLen + 1; SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1; auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId); auto reqId = executor.enqueueRequest(std::move(request)); tokens[reqId] = {}; expectedNumTokens[reqId] = ((streaming || excludeInputFromOutput) ? 0 : promptLen) + maxNewTokens; expectedNumResponses[reqId] = streaming ? 
maxNewTokens : 1; } // Get the new tokens for each requests int32_t numFinished = 0; int iter = 0; std::unordered_map numResponses; while (numFinished < numRequests && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { auto reqId = response.getRequestId(); ++numResponses[reqId]; if (!response.hasError()) { auto result = response.getResult(); numFinished += result.isFinal; auto& newTokens = result.outputTokenIds.at(beamWidth - 1); auto const expectedSize = streaming ? (beamWidth > 1 ? numResponses[reqId] : 1) : expectedNumTokens[reqId]; EXPECT_EQ(newTokens.size(), expectedSize); auto& reqTokens = tokens.at(response.getRequestId()); if (streaming && beamWidth > 1) { reqTokens = newTokens; } else { reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end()); } for (SizeType32 b = 0; b < beamWidth; ++b) { EXPECT_EQ(result.finishReasons.at(b), result.isFinal ? FinishReason::kLENGTH : FinishReason::kNOT_FINISHED); } } else { // Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); // Check that number of tokens matches expectations for (auto const& [reqId, numTokens] : expectedNumTokens) { EXPECT_EQ(expectedNumResponses[reqId], numResponses[reqId]) << "reqId " << reqId; EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId; } } TEST_P(ParamStatsTest, MultipleRequestStats) { bool streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 numRequests = 100; auto iterStatsMaxIterations = std::get<0>(GetParam()); bool useOrchestratorMode = std::get<1>(GetParam()); SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setIterStatsMaxIterations(iterStatsMaxIterations); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; std::optional orchestratorConfig = std::nullopt; if (useOrchestratorMode) { orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH()); } auto parallelConfig = ParallelConfig(CommunicationType::kMPI, useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt, std::nullopt, orchestratorConfig); executorConfig.setParallelConfig(parallelConfig); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); SizeType32 maxPromptLen = 20; SizeType32 maxMaxNewTokens = 20; SizeType32 endId = -1; // Enqueue the requests std::unordered_map tokens; std::unordered_map expectedNumTokens; for (SizeType32 req = 0; req < numRequests; ++req) { SizeType32 promptLen = rand() % maxPromptLen + 1; SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1; auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId); auto reqId = executor.enqueueRequest(std::move(request)); tokens[reqId] = {}; expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 
0 : promptLen)) + maxNewTokens; } std::atomic statsThreadDone = false; std::atomic numFinished = 0; std::deque iterStatsReceived; // Spawn a thread that continuously get stats auto statsThread = std::thread( [&executor, &numFinished, numRequests, &iterStatsReceived, &statsThreadDone]() { while (numFinished < numRequests) { auto reqStats = executor.getLatestIterationStats(); iterStatsReceived.insert(iterStatsReceived.end(), std::make_move_iterator(reqStats.begin()), std::make_move_iterator(reqStats.end())); std::this_thread::sleep_for(std::chrono::milliseconds(10)); } statsThreadDone = true; }); // Get the new tokens for each requests int iter = 0; SizeType32 numResponses = 0; while (numFinished < numRequests && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { numResponses++; if (!response.hasError()) { auto result = response.getResult(); numFinished += result.isFinal; auto& newTokens = result.outputTokenIds.at(beamWidth - 1); auto& reqTokens = tokens.at(response.getRequestId()); reqTokens.insert(reqTokens.end(), std::make_move_iterator(newTokens.begin()), std::make_move_iterator(newTokens.end())); } else { // Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); // Check that number of tokens matches expectations for (auto const& [reqId, numTokens] : expectedNumTokens) { EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId; } // Wait for stats thread to be done, fail otherwise iter = 0; while (!statsThreadDone && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); std::this_thread::sleep_for(std::chrono::milliseconds(waitTime)); iter++; } ASSERT_TRUE(statsThreadDone); if (iterStatsMaxIterations > 0) { ASSERT_GT(iterStatsReceived.size(), 1); for (auto stats : iterStatsReceived) { EXPECT_GT(stats.numActiveRequests, 0); TLLM_LOG_INFO("%d %d", stats.iter, stats.numActiveRequests); EXPECT_TRUE(stats.inflightBatchingStats.has_value()); if (stats.inflightBatchingStats.has_value()) { EXPECT_GT(stats.inflightBatchingStats.value().numScheduledRequests, 0); } } } statsThread.join(); } TEST_P(ParamTest, MultipleRequestBatchResponses) { bool const streaming = std::get<0>(GetParam()); bool const excludeInputFromOutput = std::get<1>(GetParam()); auto const beamWidth = std::get<2>(GetParam()); OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 constexpr numRequests{20}; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); SizeType32 constexpr maxPromptLen{20}; SizeType32 constexpr maxMaxNewTokens{20}; SizeType32 endId = -1; // Enqueue the requests std::unordered_map tokens; std::unordered_map expectedNumTokens; std::vector requestIds; for (SizeType32 req = 0; req < numRequests; ++req) { SizeType32 promptLen = rand() % maxPromptLen + 1; SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1; auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId); auto reqId = executor.enqueueRequest(std::move(request)); requestIds.push_back(reqId); 
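// Track the tokens received for this request and the total number of output tokens expected once it completes.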
tokens[reqId] = {}; expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 0 : promptLen)) + maxNewTokens; } // Get the new tokens for each requests int32_t numFinished = 0; int iter = 0; SizeType32 numResponses = 0; std::chrono::milliseconds waitTime(1); while (numFinished < numRequests && iter < mMaxWaitMs) { auto idResponses = executor.awaitResponses(requestIds, waitTime); for (unsigned i = 0; i < requestIds.size(); ++i) { auto& responses = idResponses[i]; for (auto& response : responses) { numResponses++; if (!response.hasError()) { auto result = response.getResult(); numFinished += result.isFinal; auto& newTokens = result.outputTokenIds.at(beamWidth - 1); auto& reqTokens = tokens.at(response.getRequestId()); if (streaming && beamWidth > 1) { reqTokens = newTokens; } else { reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end()); } } else { // Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); // Rerun awaitResponses again and we expect to only see terminated request id error. auto idResponses = executor.awaitResponses(requestIds, waitTime); for (auto const& responses : idResponses) { for (auto& response : responses) { EXPECT_TRUE(response.hasError()); std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } } // Check that number of tokens matches expectations for (auto const& [reqId, numTokens] : expectedNumTokens) { EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId; } } TEST_P(ParamTest, GetNumResponsesReadyTest) { bool const streaming = std::get<0>(GetParam()); bool const excludeInputFromOutput = std::get<1>(GetParam()); auto const beamWidth = std::get<2>(GetParam()); OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); SizeType32 maxNumRequests = 50; SizeType32 maxPromptLen = 20; SizeType32 maxMaxNewTokens = 20; SizeType32 numRequests = rand() % maxNumRequests + 1; SizeType32 numExpectedResponses = 0; std::map reqNumExpectedResponses; std::vector ids; for (SizeType32 req = 0; req < numRequests; ++req) { SizeType32 promptLen = rand() % maxPromptLen + 1; SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1; auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig); auto id = executor.enqueueRequest(std::move(request)); ids.emplace_back(id); reqNumExpectedResponses[id] = streaming ? 
maxNewTokens : 1; numExpectedResponses += reqNumExpectedResponses.at(id); } SizeType32 iter = 0; SizeType32 numReady = 0; while (numReady < numExpectedResponses && iter < mMaxWaitMs) { numReady = 0; for (auto id : ids) { numReady += executor.getNumResponsesReady(id); } std::this_thread::sleep_for(std::chrono::milliseconds(1)); ++iter; } EXPECT_LT(iter, mMaxWaitMs); // Expect one response per request for (auto id : ids) { SizeType32 numReady = executor.getNumResponsesReady(id); EXPECT_EQ(numReady, reqNumExpectedResponses.at(id)); } auto numResponsesReady = executor.getNumResponsesReady(); EXPECT_EQ(numResponsesReady, numExpectedResponses); } namespace { void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& modelIds, FlakyTestInfo const& flakyTestInfo, bool streaming, SizeType32 const vocabSizePadded, BeamResult const& beamResult, OutputConfig const& outConfig, bool isSpeculativeDecoding, int maxWaitMs, bool returnAllGeneratedTokens, SizeType32 const numReturnSequences, bool isNonGreedySampling, SizeType32 const modelParallelism) { auto const beamWidth = beamResult.beamWidth; auto manager = tr::BufferManager(std::make_shared()); auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU); auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId); auto const* const givenInputData = tr::bufferCast(*givenInput); auto const& inputShape = givenInput->getShape(); ASSERT_EQ(inputShape.nbDims, 2); ASSERT_GT(inputShape.d[0], 0); // Load expected outputs for each beam width value auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds); auto const maxSeqLen = testData.maxSeqLen; // Load expected outputs and inputs SizeType32 numRequests = static_cast(givenInputLengths.size()); SizeType32 maxRequests = numRequests; std::vector requests; std::vector reqMaxNewTokens; auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth); // top-k will be set by a large number to test non-identical N sequences. if (isNonGreedySampling) { samplingConfig.setTopK(32); } samplingConfig.setNumReturnSequences(numReturnSequences); for (SizeType32 req = 0; req < maxRequests; ++req) { SizeType32 inputLen = givenInputLengths.at(req); auto maxNewTokens = maxSeqLen - maxInputLength; reqMaxNewTokens.push_back(maxNewTokens); SizeType32 endId = -1; auto const* const seqBegin = givenInputData + req * maxInputLength; VecTokens tokens(seqBegin, seqBegin + inputLen); auto request = Request( VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming, samplingConfig, outConfig, endId); request.setReturnAllGeneratedTokens(returnAllGeneratedTokens); requests.emplace_back(std::move(request)); } auto& comm = tensorrt_llm::mpi::MpiComm::world(); auto const worldRank = comm.getRank(); // Expected return sizes. auto const numSequences = beamWidth > 1 ? 
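    // Beam search returns all beams of a single sequence, whereas sampling with
    // numReturnSequences > 1 produces that many independent sequences; numReturnBeams caps how
    // many beams each result carries.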
1 : numReturnSequences; auto const numReturnBeams = std::min(beamWidth, numReturnSequences); if (worldRank == 0) { auto const reqIds = executor.enqueueRequests(requests); std::unordered_map> tokens; std::unordered_map reqIdToBatchId; for (SizeType32 req = 0; req < reqIds.size(); ++req) { std::vector resultTokens(numSequences, BeamTokens(numReturnBeams)); tokens[req] = std::move(resultTokens); reqIdToBatchId[reqIds.at(req)] = req; } // Get the new tokens for each requests int32_t numFinished = 0; int iter = 0; std::unordered_map numResponses; while (numFinished < maxRequests && iter < maxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { auto batchId = reqIdToBatchId.at(response.getRequestId()); numResponses[batchId]++; if (!response.hasError()) { auto result = response.getResult(); numFinished += result.isFinal; auto seqIdx = result.sequenceIndex; auto const& contextLogits = result.contextLogits; auto const& genLogits = result.generationLogits; auto const& outputTokenIds = result.outputTokenIds; EXPECT_EQ(result.finishReasons.size(), numReturnBeams); for (SizeType32 beam = 0; beam < numReturnBeams; ++beam) { auto const& newTokens = outputTokenIds.at(beam); auto& reqTokens = tokens.at(batchId).at(seqIdx).at(beam); if (!returnAllGeneratedTokens) { reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end()); } else { EXPECT_EQ(newTokens.size(), (numResponses.at(batchId) + numReturnSequences - 1) / numReturnSequences); reqTokens = newTokens; } // FinishReason is only supported for bw=1 and inflight batching. if (beamWidth == 1) { EXPECT_EQ(result.finishReasons.at(beam), result.isSequenceFinal ? FinishReason::kLENGTH : FinishReason::kNOT_FINISHED); } } auto const& cumLogProbs = result.cumLogProbs; auto const& logProbs = result.logProbs; auto const& beamTokens = tokens.at(batchId).at(seqIdx); EXPECT_EQ(beamTokens.size(), numReturnBeams); if (!isNonGreedySampling) { float const logitsAtol = modelParallelism > 1 ? 1e-1 : 1e-2; float const logitsRtol = modelParallelism > 1 ? 
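                    // Logits comparisons use looser tolerances for multi-GPU engines
                    // (modelParallelism > 1), where splitting the model across ranks can change
                    // the numerics slightly.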
1e-2 : 1e-3; testData.verifyLogProbs(outConfig.returnLogProbs, streaming, outConfig.excludeInputFromOutput, givenInputLengths.at(batchId), beamWidth, beamTokens, cumLogProbs, logProbs, batchId, flakyTestInfo); testData.validateContextLogits(outConfig.returnContextLogits, givenInputLengths.at(batchId), beamWidth, contextLogits, vocabSizePadded, batchId, logitsAtol, logitsRtol); testData.validateGenerationLogits(outConfig.returnGenerationLogits, result.isSequenceFinal, streaming, outConfig.excludeInputFromOutput, givenInputLengths.at(batchId), reqMaxNewTokens.at(batchId), beamWidth, beamTokens, genLogits, vocabSizePadded, batchId, returnAllGeneratedTokens, logitsAtol, logitsRtol); } // Ignore first iteration as it doesn't use draft tokens if (outConfig.returnPerfMetrics && isSpeculativeDecoding && result.requestPerfMetrics.value().iter > 0) { auto& specDecMetrics = result.requestPerfMetrics.value().speculativeDecoding; // 4 draft tokens are used per step EXPECT_EQ(specDecMetrics.totalDraftTokens, result.requestPerfMetrics.value().iter.value() * 4); EXPECT_EQ(specDecMetrics.acceptanceRate, static_cast(specDecMetrics.totalAcceptedDraftTokens) / specDecMetrics.totalDraftTokens); } } else { // Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } } ++iter; } EXPECT_LT(iter, maxWaitMs); testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo, isSpeculativeDecoding, beamWidth, numSequences, isNonGreedySampling); } } void runTest(fs::path const& modelPath, ExecutorConfig const& executorConfig, fs::path const& inputPath, ModelIds const& modelIds, FlakyTestInfo const& flakyTestInfo, bool streaming, SizeType32 const vocabSizePadded, BeamResult const& beamResult, OutputConfig const& outConfig, bool isSpeculativeDecoding, int maxWaitMs, bool returnAllGeneratedTokens, SizeType32 const numReturnSequences, bool isNonGreedySampling, SizeType32 const modelParallelism) { auto executor = Executor{modelPath, ModelType::kDECODER_ONLY, executorConfig}; runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig, isSpeculativeDecoding, maxWaitMs, returnAllGeneratedTokens, numReturnSequences, isNonGreedySampling, modelParallelism); } ExecutorConfig createExecutorConfig(SizeType32 maxBeamWidth, bool useOrchestratorMode, bool gatherGenerationLogits, std::optional> deviceIds = std::nullopt, std::optional> participantIds = std::nullopt) { // Note: we reduce memory fraction for cases that return context/generation logits which require more free // memory FloatType constexpr freeGpuMemoryFraction{0.5F}; KvCacheConfig kvCacheConfig(false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction); auto executorConfig = ExecutorConfig(maxBeamWidth); executorConfig.setKvCacheConfig(kvCacheConfig); executorConfig.setNormalizeLogProbs(false); executorConfig.setGatherGenerationLogits(gatherGenerationLogits); std::optional orchestratorConfig = std::nullopt; if (useOrchestratorMode) { orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH()); } auto parallelConfig = ParallelConfig(CommunicationType::kMPI, useOrchestratorMode ? 
CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::move(deviceIds), std::move(participantIds), orchestratorConfig); executorConfig.setParallelConfig(parallelConfig); return executorConfig; } } // namespace TEST_P(AllParamsTest, TokenComparison) { auto const streaming = std::get<0>(GetParam()); auto const& beamWidth = std::get<1>(GetParam()); OutputConfig outConfig; outConfig.returnLogProbs = std::get<2>(GetParam()); outConfig.excludeInputFromOutput = std::get<3>(GetParam()); outConfig.returnContextLogits = std::get<4>(GetParam()); outConfig.returnGenerationLogits = std::get<5>(GetParam()); auto const modelName = std::get<6>(GetParam()); auto const useOrchestratorMode = std::get<7>(GetParam()); auto const returnAllGeneratedTokens = std::get<8>(GetParam()); auto const numReturnSequences = std::get<9>(GetParam()); if (returnAllGeneratedTokens && !streaming) { GTEST_SKIP() << "Test does not support returnAllGeneratedTokens without streaming"; } std::optional> participantIds = std::nullopt; BeamResult beamResult{beamWidth}; ASSERT_TRUE(fs::exists(DATA_PATH)); fs::path modelPath; // set defaults and adjust if needed by different models fs::path inputPath = DATA_PATH / "input_tokens.npy"; ModelIds modelIds{50256, 50256}; bool isSpeculativeDecoding{false}; SizeType32 vocabSizePadded = 50257; // NOTE: This can be used to disable checks for certain prompt batch entries FlakyTestInfo flakyTestInfo; if (modelName == "gpt") { auto const resultsPath = GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth)); if (outConfig.returnContextLogits || outConfig.returnGenerationLogits) { modelPath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu"; beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE(); beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE(); beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE(); if (outConfig.returnLogProbs) { beamResult.cumLogProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_CUM_LOG_PROBS_FILE(); beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_LOG_PROBS_FILE(); } } else { modelPath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_FILE(); if (outConfig.returnLogProbs) { beamResult.cumLogProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CUM_LOG_PROBS_FILE(); beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_LOG_PROBS_FILE(); } } } else if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1" || modelName == "llama_tp1_pp2_cp1") { inputPath = DATA_PATH / LLAMA_INPUT_FILE; modelIds.padId = LLAMA_PAD_ID; modelIds.endId = LLAMA_END_ID; vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED; auto const resultsPath = LLAMA_DATA_PATH / ((beamWidth == 1) ? 
"sampling" : "beam_search_" + std::to_string(beamWidth)); if (modelName == "llama_tp4_pp1_cp1") { beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP4_PP1_FILE(); modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu"; } else if (modelName == "llama_tp1_pp4_cp1") { beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP4_FILE(); modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu"; } else if (modelName == "llama_tp1_pp2_cp1") { beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP2_FILE(); modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp2-cp1-gpu"; } else if (modelName == "llama_tp2_pp2_cp1") { beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP2_PP2_FILE(); modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu"; } beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_TP4_PP1_FILE(); if (outConfig.returnLogProbs) { beamResult.cumLogProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CUM_LOG_PROBS_TP4_PP1_FILE(); beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_LOG_PROBS_TP4_PP1_FILE(); } } else if (modelName == "medusa") { TLLM_CHECK_WITH_INFO(beamWidth == 1, "Medusa does not support beam search."); auto const resultsPath = MEDUSA_DATA_PATH / "sampling"; auto modelSpec = ModelSpec::getDefaultModelSpec() .useMedusa() .setInputFile("input_tokens_long.npy") .setMaxOutputLength(128); beamResult.resultsFile = resultsPath / modelSpec.getResultsFile(); modelPath = MEDUSA_MODEL_PATH / modelSpec.getModelPath() / "tp1-pp1-cp1-gpu"; inputPath = DATA_PATH / "input_vicuna.npy"; modelIds.padId = 2; modelIds.endId = 2; isSpeculativeDecoding = true; outConfig.returnPerfMetrics = true; } else if (modelName == "chatglm" || modelName == "chatglm2" || modelName == "chatglm3" || modelName == "glm") { fs::path resultsPath; if (modelName == "chatglm") { resultsPath = CHATGLM_DATA_PATH; modelPath = CHATGLM_MODEL_PATH; } else if (modelName == "chatglm2") { resultsPath = CHATGLM2_DATA_PATH; modelPath = CHATGLM2_MODEL_PATH; } else if (modelName == "chatglm3") { resultsPath = CHATGLM3_DATA_PATH; modelPath = CHATGLM3_MODEL_PATH; } else if (modelName == "glm") { resultsPath = GLM_DATA_PATH; modelPath = GLM_MODEL_PATH; } resultsPath /= (beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth); beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_FILE(); modelPath = modelPath / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; char versionChatglm{0}; if (size_t index = modelPath.string().find("chatglm"); index != std::string::npos) { versionChatglm = modelPath.string()[index + 7]; std::string const vChatglmString = (versionChatglm == '-') ? std::string("") : std::string(1, versionChatglm); inputPath = DATA_PATH / ("input_tokens_chatglm" + vChatglmString + "-6b.npy"); modelIds.padId = (versionChatglm == '-') ? 3 : 0; modelIds.endId = (versionChatglm == '-') ? 
130005 : 2; } else if (size_t index = modelPath.string().find("glm-10b"); index != std::string::npos) { inputPath = DATA_PATH / "input_tokens_glm-10b.npy"; modelIds.padId = 50256; modelIds.endId = 50258; } if (versionChatglm != 0) { flakyTestInfo.batchIdBeams.insert(std::make_pair(1, 0)); } } else { TLLM_THROW("Unrecognized modelName"); } if (streaming && beamWidth > 1) { GTEST_SKIP() << "Test does not support streaming with beam search"; } // Warning: This should be the last check before running the test. // It will initialize MPI which can take significant time. if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1" || modelName == "llama_tp1_pp2_cp1") { // For llama model, only run for multiple GPUs // This is detected by setting an env variable when running the test char const* val = getenv("RUN_LLAMA_MULTI_GPU"); if (val == nullptr) { GTEST_SKIP() << "Skipping Llama test"; } if (outConfig.returnContextLogits) { GTEST_SKIP() << "Skipping context logits tests for mpi runs"; } // Check that it was launched with right number of MPI ranks if (!useOrchestratorMode && COMM_SESSION.getSize() != 4) { // No orchestrator, need worldSize to match TP*PP FAIL() << "Leader mode and world size is not equal to 4"; } if (useOrchestratorMode && COMM_SESSION.getSize() != 1) { // No orchestrator, need worldSize to match TP*PP FAIL() << "Orchestrator mode and World size is not equal to 1"; } } auto decoderJsonConfig = tensorrt_llm::runtime::GptJsonConfig::parse(modelPath / "config.json"); auto const modelTP = decoderJsonConfig.getTensorParallelism(); auto const modelPP = decoderJsonConfig.getPipelineParallelism(); auto const modelParallelism = modelTP * modelPP; int deviceCount = -1; TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); std::optional> deviceIds = std::vector(modelParallelism); for (auto i = 0; i < deviceIds->size(); i++) { deviceIds->at(i) = i % deviceCount; } if (modelName == "llama_tp1_pp2_cp1") { auto const& session = tensorrt_llm::mpi::MpiComm::world(); if (session.getSize() != 4) { FAIL() << "Llama-tp1-pp2 is intended solely for testing coexisting engines within the same MPI world," " which requires a session size of 4. 
However, the current session size is " << session.getSize() << " ."; } if (session.getRank() / 2 == 0) { participantIds = std::vector{0, 1}; deviceIds = std::vector{0, 1}; } else { participantIds = std::vector{2, 3}; deviceIds = std::vector{2, 3}; } } if (modelPP > 1) { std::reverse(deviceIds->begin(), deviceIds->end()); if (modelTP > 1) { for (SizeType32 ppRank = 0; ppRank < modelPP; ppRank++) { std::reverse(deviceIds->begin() + ppRank * modelTP, deviceIds->begin() + (ppRank + 1) * modelPP); } } } // Returning logits will bring higher latency if (streaming && (outConfig.returnContextLogits || outConfig.returnGenerationLogits)) { mMaxWaitMs = 20000; } auto executorConfig = createExecutorConfig(beamWidth, useOrchestratorMode, outConfig.returnGenerationLogits, std::move(deviceIds), std::move(participantIds)); runTest(modelPath, executorConfig, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig, isSpeculativeDecoding, mMaxWaitMs, returnAllGeneratedTokens, numReturnSequences, false, modelParallelism); } TEST_F(GptExecutorTest, ChangeBeamWidth) { SizeType32 constexpr maxBeamWidth{2}; auto executorConfig = ExecutorConfig(maxBeamWidth); auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); SizeType32 constexpr beamWidth1{1}; SizeType32 constexpr beamWidth2{2}; SizeType32 constexpr maxNewTokens{2}; VecTokens inputTokens{1, 2, 3, 4}; // Create requests with different beam widths std::vector requests; requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1)); requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1)); requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth2)); requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1)); auto requestIds = executor.enqueueRequests(requests); int numFinished = 0; int iter = 0; while (numFinished < 4 && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { if (response.hasError()) { auto err = response.getErrorMsg(); std::cout << "err:" << err << std::endl; FAIL() << "Should not get a response with error"; } else { auto result = response.getResult(); numFinished += static_cast(result.isFinal); } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); auto stats = executor.getLatestIterationStats(); uint64_t currentIter = 0; for (auto const& stat : stats) { // TODO: enable this check when stats are cleaned // EXPECT_EQ(stat.iter, currentIter); if (stat.iter < 2) { // req 1 and 2 run with same beam width EXPECT_EQ(stat.numActiveRequests, 2); } else if (stat.numActiveRequests != 0) // TODO: remove this check when stats are cleaned { // req 3 or 4 run width different beam width EXPECT_EQ(stat.numActiveRequests, 1); } ++currentIter; } } void doTokenComparisonChangeBeamWidth(bool enableReuse, SizeType32 maxWaitMs) { SizeType32 constexpr maxBeamWidth{2}; SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded auto constexpr streaming = false; // Create executor config auto kvCacheConfig = KvCacheConfig(enableReuse); auto executorConfig = ExecutorConfig(maxBeamWidth, SchedulerConfig(), kvCacheConfig); // Create executor auto trtEnginePath = (GPT_MODEL_PATH / 
PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu"); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); auto const inputPath = DATA_PATH / "input_tokens.npy"; ModelIds modelIds{50256, 50256}; OutputConfig outConfig; FlakyTestInfo flakyTestInfo; bool constexpr isSpeculativeDecoding{false}; for (SizeType32 beamWidth : {1, 2}) { BeamResult beamResult{beamWidth}; auto const resultsPath = GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth)); beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE(); beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE(); beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE(); runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig, isSpeculativeDecoding, maxWaitMs, false, 1, false, 1); } } TEST_F(GptExecutorTest, TokenComparisonChangeBeamWidth) { doTokenComparisonChangeBeamWidth(false, mMaxWaitMs); } TEST_F(GptExecutorTest, TokenComparisonChangeBeamWidthBlockReuse) { doTokenComparisonChangeBeamWidth(true, mMaxWaitMs); } TEST_F(GptExecutorTest, NReturnRandomness) { SizeType32 constexpr maxBeamWidth{1}; SizeType32 constexpr numReturnSequences{2}; SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded auto constexpr streaming = false; // Create executor config auto executorConfig = ExecutorConfig(maxBeamWidth); // Create executor auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu"); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); auto const inputPath = DATA_PATH / "input_tokens.npy"; ModelIds modelIds{50256, 50256}; OutputConfig outConfig; FlakyTestInfo flakyTestInfo; bool constexpr isSpeculativeDecoding{false}; BeamResult beamResult{maxBeamWidth}; auto const resultsPath = GPT_DATA_PATH / "sampling"; beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE(); beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE(); beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE(); runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig, isSpeculativeDecoding, mMaxWaitMs, false, 1, true, 1); } TEST_F(GptExecutorTest, TimedOut) { SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // No requests enqueued, expect no responses auto numResponsesReady = executor.getNumResponsesReady(); EXPECT_EQ(numResponsesReady, 0); std::chrono::milliseconds waitTime(10); auto responses = executor.awaitResponses(waitTime); EXPECT_EQ(responses.size(), 0); } TEST_F(GptExecutorTest, MaxSeqIdleMicrosecondsError) { auto executorConfig = ExecutorConfig(1); // Request will time out executorConfig.setMaxSeqIdleMicroseconds(1); auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); SizeType32 constexpr maxNewTokens{5}; VecTokens inputTokens{1, 2, 3, 4}; std::vector requests; requests.emplace_back(inputTokens, 
maxNewTokens, false); auto requestIds = executor.enqueueRequests(requests); bool done = false; int iter = 0; while (!done && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { if (response.hasError()) { auto err = response.getErrorMsg(); std::cout << "err:" << err << std::endl; EXPECT_THAT(err, testing::HasSubstr("Unable to get batch slot for reqId")); done = true; } else { FAIL() << "Should get a response with error"; } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); } void logitsProcessorMixedReqsTest(std::string const& modelDir, SizeType32 worldRank, SizeType32 maxWaitMs, bool replicated, std::optional> deviceIds); TEST_P(LogitsProcParamsTest, All) { auto const modelName = std::get<0>(GetParam()); auto const batched = std::get<1>(GetParam()); auto const replicated = std::get<2>(GetParam()); std::string modelDir; int tp_size = 1, pp_size = 1, cp_size = 1; std::optional> deviceIds = std::nullopt; if (modelName == "llama_tp1_pp1_cp1") { modelDir = "tp1-pp1-cp1-gpu"; } else if (modelName == "llama_tp4_pp1_cp1") { modelDir = "tp4-pp1-cp1-gpu"; tp_size = 4; } else if (modelName == "llama_tp1_pp4_cp1") { modelDir = "tp1-pp4-cp1-gpu"; pp_size = 4; deviceIds = std::vector{3, 2, 1, 0}; } else if (modelName == "llama_tp2_pp2_cp1") { modelDir = "tp2-pp2-cp1-gpu"; tp_size = pp_size = 2; deviceIds = std::vector{2, 3, 0, 1}; } else { TLLM_THROW("Unrecognized modelName"); } std::filesystem::path modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / modelDir; auto& comm = tensorrt_llm::mpi::MpiComm::world(); auto const worldRank = comm.getRank(); auto const worldSize = comm.getSize(); if (tp_size * pp_size * cp_size != 1) { // Run multi GPU test only when env variable is set char const* val = getenv("RUN_LLAMA_MULTI_GPU"); if (val == NULL) { GTEST_SKIP() << "Skipping multi-gpu logits post processor test"; } if (worldSize != 4) { FAIL() << "Leader mode and world size is not equal to 4"; } } else { // This has no effect for single-GPU tests if (replicated) { GTEST_SKIP() << "Skipping single-gpu replicated logits post processor test"; } } // Configuration options bool const streaming = false; bool excludeInputFromOutput = false; OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; SizeType32 numRequests = 20; IdType const kClientId = 1234; SizeType32 beamWidth = 1; SizeType32 maxPromptLen = 20; SizeType32 maxMaxNewTokens = 20; SizeType32 constexpr endId{2}; SizeType32 constexpr vocabSizePadded{32000}; // llama-7b vocabSizePadded // We just use tokenIdCalculator to generate a token_id based on request index, output position and max new tokens. // Then LogitsPostProcessor set all other logits except the generated token_id to large negative value. // So the output token should be the generated token by tokenIdCalculator. 
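    // Worked example with illustrative values: reqId = 3 and pos = 7 give
    // (3 * 1000 + 7) % 32000 = 3007; if the computed id equals endId it is remapped to 0 so the
    // forced token never terminates generation early.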
auto tokenIdCalculator = [endId, vocabSizePadded](IdType req, SizeType32 pos) { SizeType32 tokenId = (req * 1000 + pos) % vocabSizePadded; if (tokenId == endId) { tokenId = 0; } return tokenId; }; std::unordered_map tokens; std::unordered_map expectedNumTokens; std::unordered_map expectedOutputTokens; // Enqueue the requests auto enqueueRequests = [&](Executor& executor, std::optional logitsProcessorName, std::optional logitsProcessor = std::nullopt) { tokens.clear(); expectedNumTokens.clear(); expectedOutputTokens.clear(); for (SizeType32 req = 0; req < numRequests; ++req) { SizeType32 promptLen = rand() % maxPromptLen + 1; SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1; auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId); request.setClientId(kClientId); if (logitsProcessorName) { request.setLogitsPostProcessorName(logitsProcessorName.value()); } else if (logitsProcessor) { request.setLogitsPostProcessor(logitsProcessor.value()); } auto reqId = executor.enqueueRequest(std::move(request)); tokens[reqId] = {}; expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 0 : promptLen)) + maxNewTokens; expectedOutputTokens[reqId] = {}; if (!streaming && !excludeInputFromOutput) { expectedOutputTokens[reqId].resize(promptLen, 1); } for (SizeType32 outputPos = 0; outputPos < maxNewTokens; ++outputPos) { SizeType32 outputTokenId = tokenIdCalculator(reqId, outputPos + promptLen); expectedOutputTokens[reqId].push_back(outputTokenId); } } }; // Get the new tokens for each requests auto collectResponses = [&](Executor& executor) { int32_t numFinished = 0; int iter = 0; SizeType32 numResponses = 0; while (numFinished < numRequests && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { numResponses++; if (!response.hasError()) { EXPECT_EQ(response.getClientId().value(), kClientId); auto result = response.getResult(); numFinished += result.isFinal; auto& newTokens = result.outputTokenIds.at(beamWidth - 1); auto& reqTokens = tokens.at(response.getRequestId()); reqTokens.insert(reqTokens.end(), std::make_move_iterator(newTokens.begin()), std::make_move_iterator(newTokens.end())); } else { // Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); }; // Check that tokens matches expectations auto checkOutput = [&]() { for (auto const& [reqId, numTokens] : expectedNumTokens) { EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId; for (SizeType32 tokenPos = 0; tokenPos < std::min(expectedNumTokens[reqId], tokens[reqId].size()); ++tokenPos) { EXPECT_EQ(expectedOutputTokens[reqId][tokenPos], tokens[reqId][tokenPos]) << "reqId=" << reqId << ", tokenPos=" << tokenPos; } } }; // Test non-batched logits processor std::string const logitsProcessorName = "SelectToken"; auto logitsPostProcessorFn = [&](IdType reqId, Tensor& logits, BeamTokens const& tokens, StreamPtr const& streamPtr, std::optional clientId) { if (replicated) { EXPECT_TRUE(worldRank <= tp_size - 1); } else { EXPECT_TRUE(worldRank == 0); } EXPECT_TRUE(clientId.value() == kClientId); SizeType32 numTokens = tokens.at(0).size(); SizeType32 pos = numTokens; SizeType32 outputTokenId = 
tokenIdCalculator(reqId, pos); auto logitsDataType = logits.getDataType(); EXPECT_TRUE(logitsDataType == DataType::kFP16 || logitsDataType == DataType::kBF16 || logitsDataType == DataType::kFP32); // logits has shape [draftLength + 1, reqBeamWidth, vocabSize] auto logitsCpu = tensorrt_llm::executor::Tensor::cpu(logitsDataType, logits.getShape()); auto* dataPtr = logitsCpu.getData(); auto eltSize = logitsCpu.getSizeInBytes() / logitsCpu.getSize(); EXPECT_TRUE(eltSize == 2 || eltSize == 4); if (eltSize == 2) { auto* dataPtrU16 = static_cast(dataPtr); uint16_t hugeNegValue = logitsDataType == DataType::kFP16 ? 0xFBFF : 0xFF7F; // a huge negative value for (size_t i = 0; i < logitsCpu.getSize(); ++i) { dataPtrU16[i] = hugeNegValue; } dataPtrU16[outputTokenId] = 0; } else { auto* dataPtrFloat = static_cast(dataPtr); for (size_t i = 0; i < logitsCpu.getSize(); ++i) { dataPtrFloat[i] = -HUGE_VALF; } dataPtrFloat[outputTokenId] = 0.0f; } logits.setFrom(logitsCpu, streamPtr); }; if (!batched) { auto executorConfig = ExecutorConfig(beamWidth); LogitsPostProcessorConfig logitsProcConfig{ std::unordered_map{ {logitsProcessorName, logitsPostProcessorFn}}, std::nullopt, replicated}; executorConfig.setLogitsPostProcessorConfig(logitsProcConfig); if (deviceIds.has_value()) { auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig()); parallelConfig.setDeviceIds(deviceIds.value()); executorConfig.setParallelConfig(parallelConfig); } auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig); if (worldRank == 0) { enqueueRequests(executor, logitsProcessorName); collectResponses(executor); checkOutput(); if (!replicated || tp_size == 1) { // Dynamic logits postprocessor must be used with replicate=false or no tensor parallelism. enqueueRequests(executor, std::nullopt, logitsPostProcessorFn); collectResponses(executor); checkOutput(); } } } // Test batched logits processor auto logitsPostProcessorBatchedFn = [logitsPostProcessorFn](std::vector const& reqIdBatch, std::vector& logitsBatch, std::vector> const& tokensBatch, StreamPtr const& streamPtr, std::vector> const& clientIdBatch) { for (int sample = 0; sample < reqIdBatch.size(); sample++) { logitsPostProcessorFn( reqIdBatch[sample], logitsBatch[sample], tokensBatch[sample], streamPtr, clientIdBatch[sample]); } }; if (batched) { auto batchedExecutorConfig = ExecutorConfig(beamWidth); if (deviceIds.has_value()) { auto parallelConfig = batchedExecutorConfig.getParallelConfig().value_or(ParallelConfig()); parallelConfig.setDeviceIds(deviceIds.value()); batchedExecutorConfig.setParallelConfig(parallelConfig); } LogitsPostProcessorConfig logitsProcConfig{std::nullopt, logitsPostProcessorBatchedFn, replicated}; batchedExecutorConfig.setLogitsPostProcessorConfig(logitsProcConfig); auto batchedExecutor = Executor(modelPath, ModelType::kDECODER_ONLY, batchedExecutorConfig); if (worldRank == 0) { enqueueRequests(batchedExecutor, Request::kBatchedPostProcessorName); collectResponses(batchedExecutor); checkOutput(); } } if (!batched) { logitsProcessorMixedReqsTest(modelDir, worldRank, mMaxWaitMs, replicated, std::move(deviceIds)); } } // Test for mixing requests with and without logits processor. 
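// The first request runs without a post-processor and asks for many tokens, the second uses the
// named dummy processor and asks for few; both must finish cleanly, so batches may mix requests
// with and without a logits callback.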
void logitsProcessorMixedReqsTest(std::string const& modelDir, SizeType32 worldRank, SizeType32 maxWaitMs, bool replicated, std::optional> deviceIds) { std::string const logitsProcessorName = "dummy"; auto logitsPostProcessorFn = [&](IdType reqId, Tensor& logits, BeamTokens const& tokens, StreamPtr const& streamPtr, std::optional clientId) { // Dummy callback that does not modify logits assert(!clientId.has_value()); }; LogitsPostProcessorConfig logitsProcConfig{ std::unordered_map{ {logitsProcessorName, logitsPostProcessorFn}}, std::nullopt, replicated}; // Create executor SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); executorConfig.setLogitsPostProcessorConfig(logitsProcConfig); if (deviceIds.has_value()) { auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig()); parallelConfig.setDeviceIds(deviceIds.value()); executorConfig.setParallelConfig(parallelConfig); } std::filesystem::path modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / modelDir; auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig); if (worldRank == 0) { SizeType32 numRequests = 2; SizeType32 promptLen = 5; // First request with no LP and many output tokens auto request1 = Request(VecTokens(promptLen, 1), 25); // Second request with LP and few output tokens auto request2 = Request(VecTokens(promptLen, 1), 5); request2.setLogitsPostProcessorName(logitsProcessorName); // Enqueue requests auto reqId1 = executor.enqueueRequest(request1); auto reqId2 = executor.enqueueRequest(request2); // Wait for responses int32_t numFinished = 0; int iter = 0; SizeType32 numResponses = 0; while (numFinished < numRequests && iter < maxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { numResponses++; if (!response.hasError()) { auto result = response.getResult(); numFinished += result.isFinal; } else { // Allow response with error only if awaitResponse processed a terminated request id std::string err = "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated."; EXPECT_EQ(response.getErrorMsg(), err); } } ++iter; } EXPECT_LT(iter, maxWaitMs); } } TEST_F(GptExecutorTest, LogitsPostProcessorThrow) { SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); std::string const logitsProcessorName = "UnExistProcessor"; auto request = Request(VecTokens(10, 1), 10, false, tensorrt_llm::executor::SamplingConfig(beamWidth), OutputConfig()); request.setLogitsPostProcessorName(logitsProcessorName); EXPECT_THROW({ auto reqId = executor.enqueueRequest(std::move(request)); }, tensorrt_llm::common::TllmException); } static Response executeDraftRequest(Executor& executor) { OutputConfig outputConfig; outputConfig.returnGenerationLogits = true; // Create the request SizeType32 maxNewTokens = 4; VecTokens inputTokens{1, 2, 3, 4}; Request request{std::move(inputTokens), maxNewTokens}; request.setOutputConfig(outputConfig); // Enqueue the request auto requestId = executor.enqueueRequest(std::move(request)); // Wait for the response auto responses = executor.awaitResponses(requestId); return responses.at(0); } static Response executeTargetRequest(Executor& executor, Result const& draftResult) { // Create the request SizeType32 
maxNewTokens = 5; VecTokens inputTokens{1, 2, 3, 4}; Request request{std::move(inputTokens), maxNewTokens}; VecTokens const& outputTokenIds = draftResult.outputTokenIds.at(0); VecTokens draftTokens(outputTokenIds.end() - 4, outputTokenIds.end()); auto const& logitsInfo = draftResult.specDecFastLogitsInfo.value(); auto logitsTensor = logitsInfo.toTensor(); ExternalDraftTokensConfig draftTokensConfig( std::move(draftTokens), logitsTensor, std::nullopt /* acceptance threshold */, true /* fastLogits */); request.setExternalDraftTokensConfig(draftTokensConfig); // Enqueue the request auto requestId = executor.enqueueRequest(std::move(request)); // Wait for the response auto responses = executor.awaitResponses(requestId); return responses.at(0); } class SpeculativeDecodingTest : public GptExecutorTest { }; TEST_F(SpeculativeDecodingTest, SpecDecFastLogits) { SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto trtDraftEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu"; auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DRAFT_TOKENS_DIR() / "tp1-pp1-cp1-gpu"; FloatType freeGpuMemoryFraction = 0.3; auto kvCacheConfig = KvCacheConfig(true /* enableBlockReuse */, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction); executorConfig.setKvCacheConfig(kvCacheConfig); tensorrt_llm::mpi::initialize(tensorrt_llm::mpi::MpiThreadSupport::THREAD_MULTIPLE); int const worldSize = tensorrt_llm::mpi::MpiComm::world().getSize(); ASSERT_EQ(worldSize, 3); int const myRank = tensorrt_llm::mpi::MpiComm::world().getRank(); bool const isOrchestrator = (myRank == 0); auto orchestratorConfig = OrchestratorConfig(isOrchestrator, "" /* workerExecutablePath */, nullptr, false /* spawnPrcesses */); auto parallelConfig = ParallelConfig( CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig); executorConfig.setParallelConfig(parallelConfig); auto specDecConfig = SpeculativeDecodingConfig(true /* fastLogits */); executorConfig.setSpecDecConfig(specDecConfig); std::unique_ptr draftExecutor; std::unique_ptr targetExecutor; if (isOrchestrator) { auto executorConfigDraft = executorConfig; parallelConfig.setParticipantIds({1}); executorConfigDraft.setParallelConfig(parallelConfig); draftExecutor = std::make_unique(trtDraftEnginePath, ModelType::kDECODER_ONLY, executorConfigDraft); parallelConfig.setParticipantIds({2}); executorConfig.setParallelConfig(parallelConfig); targetExecutor = std::make_unique(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); } else if (myRank == 1) // draft model process { parallelConfig.setParticipantIds({1}); parallelConfig.setDeviceIds({0}); executorConfig.setParallelConfig(parallelConfig); executorConfig.setGatherGenerationLogits(true); draftExecutor = std::make_unique(trtDraftEnginePath, ModelType::kDECODER_ONLY, executorConfig); } else if (myRank == 2) // target model process { parallelConfig.setParticipantIds({2}); parallelConfig.setDeviceIds({0}); executorConfig.setParallelConfig(parallelConfig); draftExecutor = std::make_unique(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); } if (isOrchestrator) { auto response = executeDraftRequest(*draftExecutor); ASSERT_FALSE(response.hasError()); response = executeTargetRequest(*targetExecutor, response.getResult()); ASSERT_FALSE(response.hasError()); } } TEST_F(GptExecutorTest, OrchestratorMaxQueueSize) { auto trtEnginePath = GPT_MODEL_PATH / 
PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; SizeType32 maxQueueSize = 6; ExecutorConfig executorConfig; executorConfig.setMaxQueueSize(maxQueueSize); auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH()); auto parallelConfig = ParallelConfig( CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig); executorConfig.setParallelConfig(parallelConfig); auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the request SizeType32 maxNewTokens = 100; VecTokens inputTokens{1, 2, 3, 4}; auto request = Request(inputTokens, maxNewTokens); std::vector requestIds; auto numberOfRequests = maxQueueSize * 5; requestIds.reserve(numberOfRequests); // Enqueue more requests than the queue can manage for (int i = 0; i < numberOfRequests; i++) { auto requestId = executor.enqueueRequest(request); requestIds.emplace_back(requestId); } auto responseVectors = executor.awaitResponses(std::move(requestIds)); bool failedWithFullQueue = false; for (auto& responseVector : responseVectors) { for (auto& response : responseVector) { if (response.hasError()) { EXPECT_THAT(response.getErrorMsg(), testing::HasSubstr("Maximum queue size of 6 has been reached, please try again later")); failedWithFullQueue = true; } } } EXPECT_TRUE(failedWithFullQueue) << "Expected requests to fail due to maximum queue size reached"; // Wait for requests to get scheduled to free up space in queue std::this_thread::sleep_for(std::chrono::milliseconds(maxQueueSize * 200)); auto requestId = executor.enqueueRequest(std::move(request)); auto responses = executor.awaitResponses(requestId); for (auto& response : responses) { EXPECT_FALSE(response.hasError()); } } TEST_F(GptExecutorTest, SingleRequestInvalidInputs) { bool streaming = true; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the request SizeType32 maxNewTokens = 5; VecTokens inputTokens{1, 2, 3, 4}; std::vector expectedErrMsgs; std::vector requests; // Invalid embedding bias shape { requests.emplace_back(inputTokens, maxNewTokens, streaming); auto embeddingBias = Tensor::cpu(DataType::kFP32, {1}); requests.back().setEmbeddingBias(embeddingBias); expectedErrMsgs.emplace_back("embedding bias shape is not as expected"); } for (auto req = 0; req < requests.size(); ++req) { auto& request = requests.at(req); auto const& expectedErrMsg = expectedErrMsgs.at(req); auto requestId = executor.enqueueRequest(std::move(request)); // Try to get the new tokens bool done = false; int iter = 0; while (!done && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { auto err = response.getErrorMsg(); EXPECT_THAT(err, testing::HasSubstr(expectedErrMsg)); done = true; } else { FAIL() << "Expected an err: " << expectedErrMsg; } } ++iter; } EXPECT_EQ(done, true); } } TEST_F(GptExecutorTest, ExecutorKVCacheManager) { bool streaming = true; int numRequests = 3; SizeType32 beamWidth = 1; SizeType32 maxNewTokens = 5; auto executorConfig = ExecutorConfig(beamWidth); auto kvCacheConfig = KvCacheConfig(true, 128); kvCacheConfig.setEventBufferMaxSize(1024); executorConfig.setKvCacheConfig(kvCacheConfig); auto trtEnginePath = 
GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); auto kvCacheManager = *executor.getKVCacheEventManager(); // Created event should be available before any requests. auto events = kvCacheManager->getLatestEvents(std::chrono::seconds(1)); EXPECT_EQ(events.size(), 1); EXPECT_TRUE(std::holds_alternative(events.front().data)); // Create requests std::vector requests; for (int request = 0; request < 3; request++) { VecTokens inputTokens; for (int i = 0; i < 63; i++) { inputTokens.emplace_back(i + request); } requests.emplace_back(inputTokens, maxNewTokens, streaming); } for (auto req = 0; req < requests.size(); ++req) { auto& request = requests.at(req); auto requestId = executor.enqueueRequest(std::move(request)); // Get the new tokens bool done = false; int iter = 0; while (!done && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { // This request failed for some reason, get error msg std::string errStr = "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg(); FAIL(); } else { auto result = response.getResult(); done = result.isFinal; if (done) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); auto events = kvCacheManager->getLatestEvents(std::chrono::milliseconds(100)); if (req == 0) { EXPECT_EQ(events.size(), 2); // Store the first context block EXPECT_EQ(std::get(events.front().data).parentHash, std::nullopt); EXPECT_EQ(std::get(events.front().data).blocks.size(), 1); // Store the second (now completed) context block and the partial decode block. EXPECT_EQ(std::get(events.back().data).blocks.size(), 2); EXPECT_EQ(std::get(events.front().data).blocks[0].blockHash, std::get(events.back().data).parentHash); } else { EXPECT_EQ(events.size(), 4); // Remove a block to make room for the second context block. On the second request, we need // to remove 2 blocks. 
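                        // For later requests the manager first evicts blocks to make room, so four
                        // events are expected: removed blocks (one per prior request), the stored
                        // first context block, another eviction for the decode phase, and finally
                        // the stored last context block together with the decode block.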
EXPECT_EQ(std::get(events.front().data).blockHashes.size(), req); events.pop_front(); // Store the first filled context block EXPECT_EQ(std::get(events.front().data).blocks.size(), 1); events.pop_front(); // Remove a block for the decode phase EXPECT_EQ(std::get(events.front().data).blockHashes.size(), 1); events.pop_front(); // Store the final context block and the decode block EXPECT_EQ(std::get(events.front().data).blocks.size(), 2); } } } } iter++; } EXPECT_EQ(done, true); } } TEST_F(GptExecutorTest, SingleRequestLora) { bool streaming = true; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Load lora weights, config auto manager = tr::BufferManager(std::make_shared()); auto loraWeightsTensor = std::shared_ptr(tr::utils::loadNpy(manager, LORA_WEIGHTS_FILE.string(), tr::MemoryType::kCPU)); auto loraConfigTensor = std::shared_ptr(tr::utils::loadNpy(manager, LORA_CONFIG_FILE.string(), tr::MemoryType::kCPU)); // Create the request SizeType32 maxNewTokens = 5; VecTokens inputTokens{1, 2, 3, 4}; auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig()); auto loraConfig = LoraConfig(0, detail::ofITensor(loraWeightsTensor), detail::ofITensor(loraConfigTensor)); request.setLoraConfig(loraConfig); // Enqueue the request auto requestId = executor.enqueueRequest(std::move(request)); // Get the new tokens VecTokens tokens; bool done = false; int iter = 0; std::chrono::milliseconds waitTime(1); while (!done && iter < mMaxWaitMs) { auto responses = executor.awaitResponses(requestId, waitTime); for (auto& response : responses) { if (response.hasError()) { // This request failed for some reason, get error msg std::string errStr = "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg(); FAIL(); } else { auto result = response.getResult(); done = result.isFinal; // Append tokens auto& newTokens = result.outputTokenIds.at(beamWidth - 1); tokens.insert( tokens.end(), std::make_move_iterator(newTokens.begin()), std::make_move_iterator(newTokens.end())); } } ++iter; } EXPECT_LT(iter, mMaxWaitMs); EXPECT_EQ(tokens.size(), maxNewTokens); } TEST_P(GuidedDecodingParamsTest, All) { auto const modelName = std::get<0>(GetParam()); std::filesystem::path enginePath; std::filesystem::path tokenizerInfoPath; int tp_size = 1, pp_size = 1, cp_size = 1; std::optional> deviceIds = std::nullopt; if (modelName == "gpt") { enginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; tokenizerInfoPath = GPT_XGRAMMAR_TOKENIZER_INFO_PATH; } else if (modelName == "llama_tp1_pp1_cp1") { enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH; } else if (modelName == "llama_tp4_pp1_cp1") { enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu"; tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH; tp_size = 4; } else if (modelName == "llama_tp1_pp4_cp1") { enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu"; tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH; pp_size = 4; deviceIds = std::vector{3, 2, 1, 0}; } else if (modelName == "llama_tp2_pp2_cp1") { enginePath = LLAMA_MODEL_PATH / 
PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu"; tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH; tp_size = 2; pp_size = 2; deviceIds = std::vector{2, 3, 0, 1}; } else { TLLM_THROW("Unrecognized modelName"); } auto& comm = tensorrt_llm::mpi::MpiComm::world(); auto const worldRank = comm.getRank(); auto const worldSize = comm.getSize(); if (tp_size * pp_size * cp_size > 1) { // Run multi GPU test only when env variable is set char const* val = getenv("RUN_LLAMA_MULTI_GPU"); if (val == NULL) { GTEST_SKIP() << "Skipping multi-gpu guided decoding test"; } else { if (worldSize != 4) { FAIL() << "Leader mode and world size is not equal to 4"; } } } bool streaming = false; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); auto const tokenizerInfo = nlohmann::json::parse(std::ifstream{tokenizerInfoPath}); auto const encodedVocab = tokenizerInfo["encoded_vocab"].template get>(); auto const tokenizerStr = tokenizerInfo["tokenizer_str"].template get(); auto const stopTokenIds = tokenizerInfo["stop_token_ids"].template get>(); GuidedDecodingConfig guidedDecodingConfig( GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR, encodedVocab, tokenizerStr, stopTokenIds); executorConfig.setGuidedDecodingConfig(guidedDecodingConfig); if (deviceIds.has_value()) { auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig()); parallelConfig.setDeviceIds(deviceIds.value()); executorConfig.setParallelConfig(parallelConfig); } auto executor = Executor(enginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the requests VecTokens inputTokens; if (modelName == "gpt") { inputTokens = {2061, 318, 352, 10, 16, 30, 23998, 39559, 287, 257, 8633, 287, 33918, 5794, 25, 220}; } else // llama { inputTokens = { 128000, 62, 3923, 7037, 62, 16, 10, 16, 30, 62, 16533, 87710, 1265, 4404, 5356, 1265, 9643, 9132, 25, 62}; } SizeType32 maxNewTokens = 10; SamplingConfig samplingConfig{}; OutputConfig outputConfig{false, false, false, true}; std::vector requests; requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]); requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]); requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON)); requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]); std::string jsonSchema{ R"({"properties": {"answer": {"title": "Answer", "type": "integer"}}, "required": ["answer"], "title": "Answer", "type": "object"})"}; requests.back().setGuidedDecodingParams( GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON_SCHEMA, jsonSchema)); requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]); std::string regex{R"(\d+)"}; requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kREGEX, regex)); requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]); std::string ebnfGrammar{R"(root ::= [0-9]+)"}; requests.back().setGuidedDecodingParams( GuidedDecodingParams(GuidedDecodingParams::GuideType::kEBNF_GRAMMAR, ebnfGrammar)); std::vector expectedOutputTokens; if (modelName == "gpt") { expectedOutputTokens.push_back({1849, 7, 16, 10, 16, 8, 198, 16, 10, 16}); expectedOutputTokens.push_back({90, 366, 3672, 1298, 366, 7554, 31780, 1600, 366, 12888}); expectedOutputTokens.push_back({90, 366, 64, 77, 2032, 
68, 81, 1, 1058, 352}); expectedOutputTokens.push_back({25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645}); expectedOutputTokens.push_back({25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645}); } else // llama { expectedOutputTokens.push_back({16, 10, 16, 28, 17, 198, 62, 3923, 7037, 62}); expectedOutputTokens.push_back({5018, 16, 794, 330, 16, 498, 330, 17, 794, 330}); expectedOutputTokens.push_back({5018, 9399, 794, 16, 92}); expectedOutputTokens.push_back({16}); expectedOutputTokens.push_back({16}); } if (executor.canEnqueueRequests()) { // Enqueue the requests auto reqIds = executor.enqueueRequests(std::move(requests)); // Get the responses int numFinished = 0; int iter = 0; while (numFinished < 5 && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { auto reqId = response.getRequestId(); if (response.hasError()) { // This request failed for some reason, get error msg std::string errStr = "Request id " + std::to_string(reqId) + " failed with err " + response.getErrorMsg(); FAIL(); } else { auto result = response.getResult(); auto& newTokens = result.outputTokenIds.at(0); int reqIdx = std::find(reqIds.begin(), reqIds.end(), reqId) - reqIds.begin(); EXPECT_THAT(newTokens, ::testing::ElementsAreArray(expectedOutputTokens[reqIdx])); } numFinished++; } } EXPECT_LT(iter, mMaxWaitMs); EXPECT_EQ(numFinished, 5); } } TEST_F(GptExecutorTest, GuidedDecodingFailure) { bool streaming = false; SizeType32 beamWidth = 1; auto executorConfig = ExecutorConfig(beamWidth); std::vector stopTokenIds{50256}; auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig); // Create the requests SizeType32 maxNewTokens = 10; SamplingConfig samplingConfig{}; OutputConfig outputConfig{false, false, false, true}; VecTokens inputTokens{2061, 318, 352, 10, 16, 30, 23998, 39559, 287, 257, 8633, 287, 33918, 5794, 25, 220}; std::vector requests; requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]); requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]); requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON)); // Enqueue the requests auto reqIds = executor.enqueueRequests(std::move(requests)); // Get the responses int numFinished = 0; int iter = 0; while (numFinished < 2 && iter < mMaxWaitMs) { std::chrono::milliseconds waitTime(1); auto responses = executor.awaitResponses(waitTime); for (auto& response : responses) { auto reqId = response.getRequestId(); int reqIdx = std::find(reqIds.begin(), reqIds.end(), reqId) - reqIds.begin(); if (reqIdx == 0) { EXPECT_FALSE(response.hasError()); } else { EXPECT_TRUE(response.hasError()); } numFinished++; } } EXPECT_LT(iter, mMaxWaitMs); EXPECT_EQ(numFinished, 2); } TEST_P(ParamTest, SingleRequestCancelRequest) { bool const streaming = std::get<0>(GetParam()); bool const excludeInputFromOutput = std::get<1>(GetParam()); auto const beamWidth = std::get<2>(GetParam()); OutputConfig outConfig; outConfig.excludeInputFromOutput = excludeInputFromOutput; auto executorConfig = ExecutorConfig(beamWidth); auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu"; auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, 
    // Create the request
    SizeType32 maxNewTokens = 300;
    VecTokens inputTokens{1, 2, 3, 4};
    auto request = Request(
        inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);

    auto requestId = executor.enqueueRequest(std::move(request));
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    executor.cancelRequest(requestId);

    // Try to get the new tokens
    bool done = false;
    int iter = 0;
    VecTokens tokens;
    while (!done && iter < mMaxWaitMs)
    {
        std::chrono::milliseconds waitTime(1);
        auto responses = executor.awaitResponses(requestId, waitTime);
        for (auto& response : responses)
        {
            if (response.hasError())
            {
                FAIL() << "Did not expect errors";
            }
            else
            {
                auto result = response.getResult();
                done = result.isFinal;
                // Append tokens
                auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
                if (done)
                {
                    for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
                    {
                        EXPECT_EQ(result.finishReasons[beamIdx], FinishReason::kCANCELLED);
                    }
                }
                if (streaming && beamWidth > 1)
                {
                    tokens = newTokens;
                }
                else
                {
                    tokens.insert(tokens.end(), newTokens.begin(), newTokens.end());
                }
            }
        }
        ++iter;
    }
    EXPECT_EQ(done, true);
    EXPECT_LT(iter, mMaxWaitMs);

    auto expectedNumTokens
        = streaming ? maxNewTokens : (excludeInputFromOutput ? 0 : inputTokens.size()) + maxNewTokens;
    TLLM_LOG_INFO("num tokens: %zu, expected %zu", tokens.size(), static_cast<size_t>(expectedNumTokens));
    EXPECT_LT(tokens.size(), expectedNumTokens);
}

TEST_F(GptExecutorTest, orchModeFetchNewReqErr)
{
    SizeType32 beamWidth = 1;
    auto executorConfig = ExecutorConfig(beamWidth);
    auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
    auto parallelConfig = ParallelConfig(
        CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
    executorConfig.setParallelConfig(parallelConfig);

    auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
    auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);

    // Create a req with invalid parameters
    SizeType32 maxNewTokens = 5;
    // Create very long prompt which should result in error during request validate
    VecTokens inputTokens(10000000);
    auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
    auto requestId = executor.enqueueRequest(request);
    auto requestId2 = executor.enqueueRequest(request);

    bool done = false;
    int iter = 0;
    while (!done && iter < mMaxWaitMs)
    {
        std::chrono::milliseconds waitTime(1);
        auto responses = executor.awaitResponses(waitTime);
        for (auto& response : responses)
        {
            if (response.hasError())
            {
                auto err = response.getErrorMsg();
                EXPECT_THAT(err, testing::HasSubstr("exceeds maximum input length"));
                EXPECT_THAT(err, testing::HasSubstr("Encountered an error when fetching new request:"));
                done = true;
            }
            else
            {
                FAIL() << "Should get a response with error";
            }
        }
        ++iter;
    }
    EXPECT_LT(iter, mMaxWaitMs);
}

TEST_F(GptExecutorTest, orchModeForwardError)
{
    SizeType32 constexpr maxBeamWidth{1};
    auto executorConfig = ExecutorConfig(maxBeamWidth);
    auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
    auto parallelConfig = ParallelConfig(
        CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
    executorConfig.setParallelConfig(parallelConfig);

    auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
    auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
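
    // The executor above was built with a max beam width of 1, so a request asking for beam width 2 must be
    // rejected. In orchestrator mode the failure should surface as error responses rather than tokens.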
    // Setting request beam width to 2 which should cause failure
    SizeType32 constexpr beamWidth{2};
    SizeType32 constexpr maxNewTokens{5};
    VecTokens inputTokens{1, 2, 3, 4};
    auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
    auto requestId = executor.enqueueRequest(request);
    auto requestId2 = executor.enqueueRequest(request);

    bool done = false;
    int iter = 0;
    while (!done && iter < mMaxWaitMs)
    {
        std::chrono::milliseconds waitTime(1);
        auto responses = executor.awaitResponses(waitTime);
        for (auto& response : responses)
        {
            if (response.hasError())
            {
                auto err = response.getErrorMsg();
                std::cout << "err:" << err << std::endl;
                EXPECT_THAT(
                    err, testing::HasSubstr("Requested beam width 2 is larger than configured max beam width 1"));
                done = true;
            }
            else
            {
                FAIL() << "Should get a response with error";
            }
        }
        ++iter;
    }
    EXPECT_LT(iter, mMaxWaitMs);
}

TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
{
    auto const useOrchestratorMode = std::get<0>(GetParam());
    auto const beamWidth = std::get<1>(GetParam());
    auto const modelName = std::get<2>(GetParam());

    std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
    OutputConfig outConfig;
    auto executorConfig = ExecutorConfig(beamWidth);
    std::filesystem::path modelPath;
    if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
    {
        if (modelName == "llama_tp4_pp1_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
        }
        else if (modelName == "llama_tp1_pp4_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
            deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
        }
        else if (modelName == "llama_tp2_pp2_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
            deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
        }
    }

    // For llama model, only run for multiple GPUs
    // This is detected by setting an env variable when running the test
    char const* val = getenv("RUN_LLAMA_MULTI_GPU");
    if (val == NULL)
    {
        GTEST_SKIP() << "Skipping Llama test";
    }
    else
    {
        // Check that it was launched with right number of MPI ranks
        if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
        {
            // No orchestrator, need worldSize to match TP*PP
            FAIL() << "Leader mode and world size is not equal to 4";
        }
        else if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
        {
            // With the orchestrator, only the orchestrator rank runs the test
            FAIL() << "Orchestrator mode and World size is not equal to 1";
        }
    }

    if (useOrchestratorMode)
    {
        auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
        auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
            useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
            std::nullopt, orchestratorConfig);
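
        // In orchestrator mode the test runs with a single MPI rank and the executor brings up the worker
        // ranks itself (hence the world-size checks above expect 1); in leader mode all four ranks launched
        // via MPI participate directly.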
        if (deviceIds.has_value())
        {
            parallelConfig.setDeviceIds(deviceIds.value());
        }
        executorConfig.setParallelConfig(parallelConfig);
    }
    else
    {
        if (deviceIds.has_value())
        {
            auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
            parallelConfig.setDeviceIds(deviceIds.value());
            executorConfig.setParallelConfig(parallelConfig);
        }
    }

    auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);

    // Create the request
    SizeType32 maxNewTokens = 50;
    VecTokens inputTokens{1, 2, 3, 4};

    std::vector<Request> requests;
    for (auto streaming : {false, true})
    {
        // Add two requests with numReturnSequences = 1
        auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
        requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
        requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);

        // Add a request with numReturnSequences > 1
        auto samplingConfig2 = tensorrt_llm::executor::SamplingConfig(beamWidth);
        auto constexpr numReturnSequences = 2;
        samplingConfig2.setNumReturnSequences(numReturnSequences);
        requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig2, outConfig);
    }
    std::vector<bool> cancelRequests{true, false, true, true, false, true};

    if (executor.canEnqueueRequests())
    {
        auto const requestIds = executor.enqueueRequests(requests);

        // Cancel the requests flagged in cancelRequests shortly after they were enqueued
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
        for (SizeType32 i = 0; i < requests.size(); i++)
        {
            if (cancelRequests.at(i))
            {
                executor.cancelRequest(requestIds.at(i));
            }
        }

        std::unordered_map<IdType, bool> isStreaming;
        std::unordered_map<IdType, SizeType32> expectedNumTokens;
        SizeType32 expectedNumResponses = 0;
        for (SizeType32 i = 0; i < requests.size(); i++)
        {
            auto const& request = requests.at(i);
            auto requestId = requestIds.at(i);
            isStreaming[requestId] = request.getStreaming();
            expectedNumTokens[requestId] = (request.getStreaming() ? 0 : inputTokens.size()) + maxNewTokens;
            auto const numResponses = request.getStreaming() ? expectedNumTokens[requestId] : 1;
            auto const numReturnSequences = request.getSamplingConfig().getBeamWidth() > 1
                ? 1
                : request.getSamplingConfig().getNumReturnSequences().value_or(1);
            expectedNumResponses += numResponses * numReturnSequences;
        }

        std::unordered_map<IdType, std::unordered_map<SizeType32, VecTokens>> tokens;

        // Get the new tokens for each request
        int32_t numFinished = 0;
        int iter = 0;
        SizeType32 numResponses = 0;
        while (numFinished < requests.size() && iter < mMaxWaitMs)
        {
            std::chrono::milliseconds waitTime(1);
            auto responses = executor.awaitResponses(waitTime);
            for (auto& response : responses)
            {
                numResponses++;
                if (!response.hasError())
                {
                    auto requestId = response.getRequestId();
                    auto result = response.getResult();
                    numFinished += result.isFinal;
                    auto seqIdx = result.sequenceIndex;
                    auto numSequences = result.outputTokenIds.size();
                    auto& newTokens = result.outputTokenIds.at(numSequences - 1);
                    auto& reqResults = tokens[response.getRequestId()];
                    auto& reqTokens = reqResults[seqIdx];
                    if (isStreaming.at(requestId) && beamWidth > 1)
                    {
                        reqTokens = newTokens;
                    }
                    else
                    {
                        reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
                    }
                }
                else
                {
                    FAIL() << "Did not expect errors";
                }
            }
            ++iter;
        }
        EXPECT_LE(numResponses, expectedNumResponses);
        EXPECT_EQ(numFinished, requests.size());
        EXPECT_LT(iter, mMaxWaitMs);

        for (auto requestIdx = 0; requestIdx < requests.size(); requestIdx++)
        {
            auto const requestId = requestIds.at(requestIdx);
            for (auto seqIdx = 0; seqIdx < tokens.at(requestId).size(); seqIdx++)
            {
                auto const& seqTokens = tokens.at(requestId).at(seqIdx);
                if (cancelRequests.at(requestIdx))
                {
                    EXPECT_LT(seqTokens.size(), expectedNumTokens.at(requestId));
                }
                else
                {
                    EXPECT_EQ(seqTokens.size(), expectedNumTokens.at(requestId));
                }
            }
        }
    }
}

TEST_P(LeaderApiUsageTest, LeaderModeTest)
{
    auto const modelName = std::get<0>(GetParam());
    SizeType32 beamWidth = 2;
    OutputConfig outConfig;
    std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
    auto executorConfig = ExecutorConfig(beamWidth);
    std::filesystem::path modelPath;
    if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
    {
        if (modelName == "llama_tp4_pp1_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
        }
        else if (modelName == "llama_tp1_pp4_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
            deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
        }
        else if (modelName == "llama_tp2_pp2_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
            deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
        }
    }

    // For llama model, only run for multiple GPUs
    // This is detected by setting an env variable when running the test
    char const* val = getenv("RUN_LLAMA_MULTI_GPU");
    if (val == NULL)
    {
        GTEST_SKIP() << "Skipping Llama test";
    }
    else
    {
        // Check that it was launched with right number of MPI ranks
        if (COMM_SESSION.getSize() != 4)
        {
            // No orchestrator, need worldSize to match TP*PP
            FAIL() << "Leader mode and world size is not equal to 4";
        }
    }

    if (deviceIds.has_value())
    {
        auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
        parallelConfig.setDeviceIds(deviceIds.value());
        executorConfig.setParallelConfig(parallelConfig);
    }

    auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);

    // Since this is leader mode, all ranks should participate
    EXPECT_TRUE(executor.isParticipant());

    // Create the request
    SizeType32 maxNewTokens = 50;
    VecTokens inputTokens{1, 2, 3, 4};
    auto request
        = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
    auto requestStreaming
        = Request(inputTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
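
    // Only the rank that canEnqueueRequests() (the leader) drives the request API below; on the other ranks
    // the same calls are expected to throw TllmException, which the else branch verifies.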

    // Leader enqueues requests and waits for responses
    if (executor.canEnqueueRequests())
    {
        auto requestId = executor.enqueueRequest(request);
        auto requestId2 = executor.enqueueRequest(request);
        auto requestId3 = executor.enqueueRequest(requestStreaming);
        auto requestId4 = executor.enqueueRequest(requestStreaming);

        int32_t numFinished = 0;
        int iter = 0;
        SizeType32 numResponses = 0;
        while (numFinished < 4 && iter < mMaxWaitMs)
        {
            std::chrono::milliseconds waitTime(1);
            auto responses = executor.awaitResponses(waitTime);
            for (auto& response : responses)
            {
                numResponses++;
                if (!response.hasError())
                {
                    auto result = response.getResult();
                    numFinished += result.isFinal;
                }
                else
                {
                    FAIL() << "Did not expect errors";
                }
            }
            ++iter;
        }
        EXPECT_EQ(numFinished, 4);
        EXPECT_LT(iter, mMaxWaitMs);
    }
    else
    {
        // Check that non-leader cannot enqueue requests
        EXPECT_THROW({ auto reqId = executor.enqueueRequest(request); }, tensorrt_llm::common::TllmException);
        EXPECT_THROW({ auto responses = executor.awaitResponses(); }, tensorrt_llm::common::TllmException);
        EXPECT_THROW({ auto numResp = executor.getNumResponsesReady(); }, tensorrt_llm::common::TllmException);
        EXPECT_THROW({ executor.cancelRequest(1); }, tensorrt_llm::common::TllmException);
        EXPECT_THROW({ auto stats = executor.getLatestIterationStats(); }, tensorrt_llm::common::TllmException);
        EXPECT_THROW({ auto stats = executor.getLatestRequestStats(); }, tensorrt_llm::common::TllmException);
    }
}

TEST_F(GptExecutorTest, validateParallelConfig)
{
    auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
    {
        auto executorConfig = ExecutorConfig();
        auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
    }
    {
        std::string expectedErrMsg = "OrchestratorConfig must be set";
        try
        {
            auto executorConfig = ExecutorConfig();
            auto parallelConfig = ParallelConfig(CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR);
            executorConfig.setParallelConfig(parallelConfig);
            auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
            FAIL() << "Expected TllmException";
        }
        catch (tc::TllmException& e)
        {
            EXPECT_THAT(e.what(), testing::HasSubstr(expectedErrMsg));
        }
        catch (std::exception const& e)
        {
            FAIL() << "Expected TllmException";
        }
    }
}

TEST_P(TimeoutTest, TimeoutStreamingTest)
{
    auto const modelName = std::get<0>(GetParam());
    auto const useOrchestratorMode = std::get<1>(GetParam());
    auto const beamWidth = std::get<2>(GetParam());

    auto executorConfig = ExecutorConfig(beamWidth);
    std::filesystem::path modelPath;
    bool isMultiGpu{false};
    std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
    if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
    {
        isMultiGpu = true;
        if (modelName == "llama_tp4_pp1_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
        }
        else if (modelName == "llama_tp1_pp4_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
            deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
        }
        else if (modelName == "llama_tp2_pp2_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
            deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
        }
    }
    if (modelName == "llama_tp1_pp1_cp1")
    {
        modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
    }

    // For llama model, only run for multiple GPUs
    // This is detected by setting an env variable when running the test
    char const* val = getenv("RUN_LLAMA_MULTI_GPU");
    if (val == NULL && isMultiGpu)
    {
        GTEST_SKIP() << "Skipping MultiGpu tests";
    }
    if (val != NULL && !isMultiGpu)
    {
        GTEST_SKIP() << "Skipping SingleGpu tests";
    }
    if (val != NULL && isMultiGpu)
    {
        // Check that it was launched with right number of MPI ranks
        if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
        {
            // No orchestrator, need worldSize to match TP*PP
            FAIL() << "Leader mode and world size is not equal to 4";
        }
        if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
        {
            // With the orchestrator, only the orchestrator rank runs the test
            FAIL() << "Orchestrator mode and World size is not equal to 1";
        }
    }

    if (useOrchestratorMode)
    {
        auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
        auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
            useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
            std::nullopt, orchestratorConfig);
        if (deviceIds.has_value())
        {
            parallelConfig.setDeviceIds(deviceIds.value());
        }
        executorConfig.setParallelConfig(parallelConfig);
    }
    else
    {
        if (deviceIds.has_value())
        {
            auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
            parallelConfig.setDeviceIds(deviceIds.value());
            executorConfig.setParallelConfig(parallelConfig);
        }
    }

    auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);

    SizeType32 constexpr maxNewTokens = 10;

    // create 1 request that times out immediately
    // momentarily we don't cancel requests before forwardAsync so it will get scheduled for at least 1 forward
    VecTokens immediateCancelTokens{1, 2, 3, 4};
    auto immediateCancelRequest
        = Request(immediateCancelTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
    immediateCancelRequest.setReturnAllGeneratedTokens(true);
    immediateCancelRequest.setAllottedTimeMs(std::chrono::milliseconds(0));
    SizeType32 constexpr immediateCancelMinLength = 0;
    SizeType32 constexpr immediateCancelMaxLength = 1;

    // create 1 request that times out during the first forward
    VecTokens oneForwardTokens{11, 12, 13, 14};
    auto oneForwardRequest
        = Request(oneForwardTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
    oneForwardRequest.setReturnAllGeneratedTokens(true);
    oneForwardRequest.setAllottedTimeMs(std::chrono::milliseconds(1));
    SizeType32 constexpr oneForwardMinLength = 0;
    SizeType32 constexpr oneForwardMaxLength = 1;

    // Create the request that finishes by the number of tokens
    VecTokens finishedTokens{101, 102, 103, 104};
    auto finishedRequest
        = Request(finishedTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
    finishedRequest.setReturnAllGeneratedTokens(true);
    finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(5000));
    SizeType32 constexpr finishedMinLength = 5;
    SizeType32 constexpr finishedMaxLength = maxNewTokens;

    std::vector<FinishReason> referenceFinishReasons
        = {FinishReason::kTIMED_OUT, FinishReason::kTIMED_OUT, FinishReason::kLENGTH};
    std::vector<SizeType32> minLengths = {immediateCancelMinLength, oneForwardMinLength, finishedMinLength};
    std::vector<SizeType32> maxLengths = {immediateCancelMaxLength, oneForwardMaxLength, finishedMaxLength};
    // workaround because the last response will be empty, but we want to have at least *some* responses surpass the
    // minLength
    std::vector<SizeType32> achievedLength = {0, 0, 0};
    SizeType32 itNr{0};
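
    // Each streaming request returns all tokens generated so far in every response (setReturnAllGeneratedTokens),
    // so the per-beam lengths observed below must stay within [minLengths, maxLengths] for the corresponding
    // request, and the final finish reason must match referenceFinishReasons.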
    if (executor.canEnqueueRequests())
    {
        std::vector<Request> requests = {immediateCancelRequest, oneForwardRequest, finishedRequest};
        auto requestIds = executor.enqueueRequests(requests);

        auto numFinished = 0;
        while (numFinished < static_cast<int>(requests.size()))
        {
            itNr++;
            std::chrono::milliseconds waitTime(mMaxWaitMs);
            auto responses = executor.awaitResponses(requestIds, waitTime);
            for (auto const& response : responses)
            {
                for (auto const& responseIt : response)
                {
                    auto const reqId = responseIt.getRequestId();
                    if (responseIt.hasError())
                    {
                        // Allow response with error only if awaitResponse processed a terminated request id
                        std::string err = "ReqId " + std::to_string(reqId)
                            + " has already been processed and was terminated.";
                        if (responseIt.getErrorMsg() != err)
                        {
                            TLLM_THROW(
                                "Request id %lu encountered error: %s", reqId, responseIt.getErrorMsg().c_str());
                        }
                        continue;
                    }
                    auto const& result = responseIt.getResult();
                    if (result.isFinal)
                    {
                        requestIds.erase(std::remove(requestIds.begin(), requestIds.end(), reqId), requestIds.end());
                        numFinished++;
                    }
                    auto const finishReason = result.finishReasons;
                    auto const actualResponse = result.outputTokenIds;
                    TLLM_LOG_DEBUG("reqId %lu finished %d", reqId, result.isFinal);
                    TLLM_LOG_DEBUG("actual response:");
                    for (auto const& beam : actualResponse)
                    {
                        std::string tokenStr;
                        for (auto tok : beam)
                        {
                            tokenStr += std::to_string(tok) + " ";
                        }
                        TLLM_LOG_DEBUG("%s", tokenStr.c_str());
                    }
                    // Request ids are assigned sequentially starting at 1, so reqId - 1 indexes the per-request vectors
                    TLLM_LOG_DEBUG(
                        "beams' length must be in range [%d, %d]", minLengths[reqId - 1], maxLengths[reqId - 1]);
                    if (result.isFinal)
                    {
                        TLLM_LOG_DEBUG("finishReason");
                        std::string reasonStr;
                        for (auto const reason : finishReason)
                        {
                            // cast for easier visibility during debugging
                            EXPECT_EQ(static_cast<int>(reason), static_cast<int>(referenceFinishReasons[reqId - 1]));
                            reasonStr += std::to_string(static_cast<int>(reason)) + " ";
                        }
                        TLLM_LOG_DEBUG("%s", reasonStr.c_str());
                    }
                    EXPECT_EQ(beamWidth, actualResponse.size());
                    for (int beam = 0; beam < beamWidth; beam++)
                    {
                        EXPECT_LE(actualResponse.at(beam).size(), maxLengths[reqId - 1]) << "for request " << reqId;
                        achievedLength[reqId - 1] = std::max(
                            achievedLength[reqId - 1], static_cast<SizeType32>(actualResponse.at(beam).size()));
                    }
                }
            }
        }
        for (int reqIt = 0; reqIt < achievedLength.size(); ++reqIt)
        {
            EXPECT_GE(achievedLength[reqIt], minLengths[reqIt])
                << "request " << reqIt + 1 << " has not achieved min lengths";
        }
    }
}

TEST_P(TimeoutTest, TimeoutNonstreamingTest)
{
    auto const modelName = std::get<0>(GetParam());
    auto const useOrchestratorMode = std::get<1>(GetParam());
    auto const beamWidth = std::get<2>(GetParam());

    std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
    auto executorConfig = ExecutorConfig(beamWidth);
    std::filesystem::path modelPath;
    bool isMultiGpu{false};
    if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
    {
        isMultiGpu = true;
        if (modelName == "llama_tp4_pp1_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
        }
        else if (modelName == "llama_tp1_pp4_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
            deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
        }
        else if (modelName == "llama_tp2_pp2_cp1")
        {
            modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
            deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
        }
    }
    if (modelName == "llama_tp1_pp1_cp1")
    {
        modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
    }

    // For llama model, only run for multiple GPUs
    // This is detected by setting an env variable when running the test
    char const* val = getenv("RUN_LLAMA_MULTI_GPU");
    if (val == NULL && isMultiGpu)
    {
        GTEST_SKIP() << "Skipping MultiGpu tests";
    }
    if (val != NULL && !isMultiGpu)
    {
        GTEST_SKIP() << "Skipping SingleGpu tests";
    }
    if (val != NULL && isMultiGpu)
    {
        // Check that it was launched with right number of MPI ranks
        if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
        {
            // No orchestrator, need worldSize to match TP*PP
            FAIL() << "Leader mode and world size is not equal to 4";
        }
        if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
        {
            // With the orchestrator, only the orchestrator rank runs the test
            FAIL() << "Orchestrator mode and World size is not equal to 1";
        }
    }

    if (useOrchestratorMode)
    {
        auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
        auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
            useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
            std::nullopt, orchestratorConfig);
        if (deviceIds.has_value())
        {
            parallelConfig.setDeviceIds(deviceIds.value());
        }
        executorConfig.setParallelConfig(parallelConfig);
    }
    else
    {
        if (deviceIds.has_value())
        {
            auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
            parallelConfig.setDeviceIds(deviceIds.value());
            executorConfig.setParallelConfig(parallelConfig);
        }
    }

    auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);

    SizeType32 constexpr maxNewTokens = 5;

    // create 1 request that times out immediately
    // momentarily we don't cancel requests before forwardAsync so it will get scheduled for at least 1 forward
    VecTokens immediateCancelTokens{1, 2, 3, 4};
    auto immediateCancelRequest
        = Request(immediateCancelTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
    immediateCancelRequest.setAllottedTimeMs(std::chrono::milliseconds(0));
    std::vector<VecTokens> immediateCancelResponse = {immediateCancelTokens, immediateCancelTokens};

    // create 1 request that times out during the first forward
    VecTokens oneForwardTokens{11, 12, 13, 14};
    auto oneForwardRequest
        = Request(oneForwardTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
    oneForwardRequest.setAllottedTimeMs(std::chrono::milliseconds(1));
    std::vector<VecTokens> oneForwardResponse = {oneForwardTokens, oneForwardTokens};

    // Create the request that finishes by the number of tokens
    VecTokens finishedTokens{101, 102, 103, 104};
    auto finishedRequest
        = Request(finishedTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
    finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(6000));
    std::vector<VecTokens> finishedResponse = {{101, 102, 103, 104, 49849, 225, 49849, 232, 55742},
        {101, 102, 103, 104, 49849, 225, 49849, 232, 29082}};

    // assume responses will come in FIFO order
    std::vector<std::vector<VecTokens>> refResponses = {immediateCancelResponse, oneForwardResponse, finishedResponse};
    std::vector<FinishReason> referenceFinishReasons
        = {FinishReason::kTIMED_OUT, FinishReason::kTIMED_OUT, FinishReason::kLENGTH};

    if (executor.canEnqueueRequests())
    {
        std::vector<Request> requests = {immediateCancelRequest, oneForwardRequest, finishedRequest};
        auto requestIds = executor.enqueueRequests(requests);

        std::chrono::milliseconds waitTime(mMaxWaitMs);
        auto responses = executor.awaitResponses(requestIds, waitTime);
        for (auto const& response : responses)
        {
            for (auto const& responseIt : response)
            {
                auto const reqId = responseIt.getRequestId();
                if (responseIt.hasError())
                {
                    TLLM_THROW("Request id %lu encountered error: %s", reqId, responseIt.getErrorMsg().c_str());
                }
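
                // Verify the beam count, finish reasons, and the exact output tokens against the precomputed
                // reference responses for this request id.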
                auto const& result = responseIt.getResult();
                auto const finishReason = result.finishReasons;
                auto const actualResponse = result.outputTokenIds;
                TLLM_LOG_DEBUG("reqId %lu finished %d", reqId, result.isFinal);
                TLLM_LOG_DEBUG("actual response:");
                for (auto const& beam : actualResponse)
                {
                    std::string tokenStr;
                    for (auto tok : beam)
                    {
                        tokenStr += std::to_string(tok) + " ";
                    }
                    TLLM_LOG_DEBUG("%s", tokenStr.c_str());
                }
                TLLM_LOG_DEBUG("reference:");
                auto referenceResponse = refResponses[reqId - 1];
                for (auto const& beam : referenceResponse)
                {
                    std::string tokenStr;
                    for (auto tok : beam)
                    {
                        tokenStr += std::to_string(tok) + " ";
                    }
                    TLLM_LOG_DEBUG("%s", tokenStr.c_str());
                }
                if (result.isFinal)
                {
                    TLLM_LOG_DEBUG("finishReason");
                    std::string reasonStr;
                    for (auto const reason : finishReason)
                    {
                        // cast for easier visibility during debugging
                        EXPECT_EQ(static_cast<int>(reason), static_cast<int>(referenceFinishReasons[reqId - 1]));
                        reasonStr += std::to_string(static_cast<int>(reason)) + " ";
                    }
                    TLLM_LOG_DEBUG("%s", reasonStr.c_str());
                }
                EXPECT_EQ(beamWidth, actualResponse.size());
                for (int beam = 0; beam < beamWidth; beam++)
                {
                    EXPECT_EQ(referenceResponse.at(beam).size(), actualResponse.at(beam).size());
                    EXPECT_THAT(actualResponse.at(beam), testing::ElementsAreArray(referenceResponse.at(beam)));
                }
            }
        }
    }
}

INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamTest,
    testing::Combine(                 //
        testing::Values(false, true), // streaming
        testing::Values(false, true), // excludeInputFromOutput
        testing::Values(1, 2)         // beamWidth
        ),
    generateTestName);

INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamStatsTest,
    testing::Combine(                 //
        testing::Values(0, 1000),     // iterStatsMaxIterations
        testing::Values(false, true)  // useOrchestratorMode
        ),
    generateTestNameStats);

INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, ParamCancelReqTest,
    testing::Combine(                                                                  //
        testing::Values(false, true),                                                  // useOrchestratorMode
        testing::Values(1, 2),                                                         // beamWidth
        testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
        ),
    generateTestNameCancelReq);

INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, TimeoutTest,
    testing::Combine(                                                                   //
        testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp1_pp1_cp1"), // modelName
        testing::Values(false, true),                                                   // useOrchestratorMode
        testing::Values(2)                                                              // beamWidth
        ),
    generateTestNameTimeoutTest);

INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LeaderApiUsageTest,
    testing::Combine(                                                                  //
        testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
        ),
    generateTestNameLeaderApiUsage);

INSTANTIATE_TEST_SUITE_P(GptExecutorTest, AllParamsTest,
    testing::Combine(                 //
        testing::Values(false, true), // streaming
        testing::Values(1, 2),        // beamWidth
        testing::Values(true),        // computeLogProbs
        testing::Values(false, true), // excludeInputInOutput
        testing::Values(true),        // returnContextLogits
        testing::Values(true),        // returnGenerationLogits
        testing::Values("gpt"),       // modelName
        testing::Values(false, true), // useOrchestratorMode
        testing::Values(false, true), // returnAllGeneratedTokens
        testing::Values(1, 2)         // numReturnSequences
        ),
    generateTestNameAllParams);

INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, AllParamsTest,
    testing::Combine(                                                                   //
        testing::Values(false, true),                                                   // streaming
        testing::Values(1, 2),                                                          // beamWidth
        testing::Values(true),                                                          // computeLogProbs
        testing::Values(false, true),                                                   // excludeInputInOutput
        testing::Values(false),                                                         // returnContextLogits
        testing::Values(true),                                                          // returnGenerationLogits
        testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1"), // modelName
        testing::Values(false, true),                                                   // useOrchestratorMode
        testing::Values(false),                                                         // returnAllGeneratedTokens
        testing::Values(1)                                                              // numReturnSequences
        ),
    generateTestNameAllParams);

INSTANTIATE_TEST_SUITE_P(LlamaMultiExecutorTest, AllParamsTest,
    testing::Combine(                         //
        testing::Values(false, true),         // streaming
        testing::Values(1, 2),                // beamWidth
        testing::Values(false),               // computeLogProbs
        testing::Values(false, true),         // excludeInputInOutput
        testing::Values(false),               // returnContextLogits
        testing::Values(false),               // returnGenerationLogits
        testing::Values("llama_tp1_pp2_cp1"), // modelName
        testing::Values(false),               // useOrchestratorMode
        testing::Values(false),               // returnAllGeneratedTokens
        testing::Values(1)                    // numReturnSequences
        ),
    generateTestNameAllParams);

INSTANTIATE_TEST_SUITE_P(MedusaExecutorTest, AllParamsTest,
    testing::Combine(                 //
        testing::Values(false, true), // streaming
        testing::Values(1),           // beamWidth
        testing::Values(false),       // computeLogProbs
        testing::Values(false, true), // excludeInputInOutput
        testing::Values(false),       // returnContextLogits
        testing::Values(false),       // returnGenerationLogits
        testing::Values("medusa"),    // modelName
        testing::Values(false, true), // useOrchestratorMode
        testing::Values(false),       // returnAllGeneratedTokens
        testing::Values(1)            // numReturnSequences
        ),
    generateTestNameAllParams);

// Disable some of ChatGLM's tests since they are the same as gpt's.
INSTANTIATE_TEST_SUITE_P(ChatGlmExecutorTest, AllParamsTest,
    testing::Combine(                 //
        testing::Values(false),       // streaming
        testing::Values(1, 2),        // beamWidth
        testing::Values(false),       // computeLogProbs
        testing::Values(false),       // excludeInputInOutput
        testing::Values(false),       // returnContextLogits
        testing::Values(false),       // returnGenerationLogits
        testing::Values("chatglm"),   // modelName
        testing::Values(false),       // useOrchestratorMode
        testing::Values(false),       // returnAllGeneratedTokens
        testing::Values(1, 2)         // numReturnSequences
        ),
    generateTestNameAllParams);

// ChatGlm0 Test is for glm-10b.
INSTANTIATE_TEST_SUITE_P(ChatGlm0ExecutorTest, AllParamsTest,
    testing::Combine(               //
        testing::Values(false),     // streaming
        testing::Values(1),         // beamWidth
        testing::Values(false),     // computeLogProbs
        testing::Values(false),     // excludeInputInOutput
        testing::Values(false),     // returnContextLogits
        testing::Values(false),     // returnGenerationLogits
        testing::Values("glm"),     // modelName
        testing::Values(false),     // useOrchestratorMode
        testing::Values(false),     // returnAllGeneratedTokens
        testing::Values(1)          // numReturnSequences
        ),
    generateTestNameAllParams);

INSTANTIATE_TEST_SUITE_P(ChatGlm2ExecutorTest, AllParamsTest,
    testing::Combine(                 //
        testing::Values(false),       // streaming
        testing::Values(1),           // beamWidth
        testing::Values(false),       // computeLogProbs
        testing::Values(false),       // excludeInputInOutput
        testing::Values(false),       // returnContextLogits
        testing::Values(false),       // returnGenerationLogits
        testing::Values("chatglm2"),  // modelName
        testing::Values(false),       // useOrchestratorMode
        testing::Values(false),       // returnAllGeneratedTokens
        testing::Values(1)            // numReturnSequences
        ),
    generateTestNameAllParams);

INSTANTIATE_TEST_SUITE_P(ChatGlm3ExecutorTest, AllParamsTest,
    testing::Combine(                 //
        testing::Values(false),       // streaming
        testing::Values(1),           // beamWidth
        testing::Values(false),       // computeLogProbs
        testing::Values(false),       // excludeInputInOutput
        testing::Values(false),       // returnContextLogits
        testing::Values(false),       // returnGenerationLogits
        testing::Values("chatglm3"),  // modelName
        testing::Values(false),       // useOrchestratorMode
        testing::Values(false),       // returnAllGeneratedTokens
        testing::Values(1)            // numReturnSequences
        ),
    generateTestNameAllParams);

INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LogitsProcParamsTest,
    testing::Combine(                 //
        testing::Values("llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1",
            "llama_tp1_pp4_cp1"),     // modelName
        testing::Values(false, true), // batched
        testing::Values(false, true)  // replicated
        ),
    generateTestNameLogitsProc);

INSTANTIATE_TEST_SUITE_P(GptExecutorGuidedDecodingTest, GuidedDecodingParamsTest,
    testing::Combine(testing::Values("gpt")), generateTestNameGuidedDecoding);

INSTANTIATE_TEST_SUITE_P(LlamaExecutorGuidedDecodingTest, GuidedDecodingParamsTest,
    testing::Combine(
        testing::Values("llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1")),
    generateTestNameGuidedDecoding);