// TensorRT-LLMs/cpp/tests/executor/executorTest.cpp
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#ifndef TOP_LEVEL_DIR
#error "Define TOP_LEVEL_DIR"
#endif
#include "executorTest.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/executor/dataTransceiverState.h"
#include "tensorrt_llm/executor/requestWithId.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/executor/version.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/utils/numpyUtils.h"
#include "tensorrt_llm/testing/modelSpec.h"
#include "tests/utils/common.h"
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <nlohmann/json.hpp>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <deque>
#include <filesystem>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
namespace tr = tensorrt_llm::runtime;
namespace tc = tensorrt_llm::common;
using namespace tensorrt_llm::testing;
using namespace tensorrt_llm::executor;
using namespace std::chrono_literals;
namespace fs = std::filesystem;
using tensorrt_llm::testing::KVCacheType;
using tensorrt_llm::testing::ModelSpec;
namespace
{
auto const LORA_DATA_PATH = DATA_PATH / "lora-test-weights-gpt2-tp1";
auto const LORA_WEIGHTS_FILE = LORA_DATA_PATH / "source.npy";
auto const LORA_CONFIG_FILE = LORA_DATA_PATH / "config.npy";
auto constexpr LLAMA_INPUT_FILE = "input_tokens_llama.npy";
auto constexpr LLAMA_VOCAB_SIZE_PADDED = 128256;
auto constexpr LLAMA_PAD_ID = 128001;
auto constexpr LLAMA_END_ID = 128001;
} // namespace
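// Helper: constructing an Executor with these arguments is expected to throw; the error message must contain expectedErrMsg.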
void testInvalidCtor(std::filesystem::path const& enginePath, ModelType modelType, ExecutorConfig executorConfig,
std::string expectedErrMsg = "")
{
try
{
auto executor = Executor(enginePath, modelType, executorConfig);
FAIL() << "Expected TllmException";
}
catch (std::exception const& e)
{
EXPECT_THAT(e.what(), testing::HasSubstr(expectedErrMsg));
}
}
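// Verify that the version constant no longer holds the @TRTLLM_VERSION@ placeholder and matches version().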
TEST_F(GptExecutorTest, version)
{
EXPECT_STRNE(kTensorRtLlmVersion, "@TRTLLM_VERSION@");
EXPECT_STREQ(kTensorRtLlmVersion, version());
}
TEST_F(GptExecutorTest, validCtor)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
TEST_F(GptExecutorTest, invalidCtor)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
std::filesystem::path invalidPath{"Bla"};
// Invalid path
{
testInvalidCtor(invalidPath, ModelType::kDECODER_ONLY, executorConfig, "File does not exist");
}
}
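// After shutdown(), the executor must reject further calls: enqueueRequest, awaitResponses, stats queries and cancelRequest all throw.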
TEST_F(GptExecutorTest, enqueueAfterShutdown)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
auto requestId = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
done = response.getResult().isFinal;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
executor.shutdown();
EXPECT_FALSE(executor.canEnqueueRequests());
std::string expErrMsg{"Shutdown called"};
EXPECT_THAT([&]() { auto reqId = executor.enqueueRequest(request); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
EXPECT_THAT([&]() { auto resp = executor.awaitResponses(); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
EXPECT_THAT([&]() { auto stats = executor.getLatestIterationStats(); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
EXPECT_THAT([&]() { auto stats = executor.getLatestRequestStats(); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
EXPECT_THAT([&]() { executor.cancelRequest(requestId); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
}
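// A request referencing a LoRA task id without weights must yield an error response when the task is not cached.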
TEST_F(GptExecutorTest, missingPeftTask)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_LORA_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
auto loraConfig = LoraConfig{10};
request.setLoraConfig(loraConfig);
auto requestId = executor.enqueueRequest(request);
bool done = false;
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
EXPECT_EQ(err, std::string("LoRA task 10 not found in cache. Please send LoRA weights with request"));
done = true;
}
else
{
FAIL() << "Expects error due to missing Lora weights";
}
}
EXPECT_TRUE(done);
}
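// With external draft tokens and generation logits enabled, the returned logits have shape (1, numAcceptedDraftTokens, vocabSizePadded).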
TEST_F(GptExecutorTest, ReturnAcceptedTokenLogits)
{
SizeType32 constexpr beamWidth{1};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
// Create executor config
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setGatherGenerationLogits(true);
// Enable kv cache reuse of executorConfig
bool enableBlockReuse = true;
FloatType freeGpuMemoryFraction = 0.4;
auto kvCacheConfig
= KvCacheConfig(enableBlockReuse, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
executorConfig.setKvCacheConfig(kvCacheConfig);
// Create executor
auto trtEnginePath
= (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DRAFT_TOKENS_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4, 5, 6, 7, 8};
std::vector<bool> streamingOptions{false, true};
for (auto streaming : streamingOptions)
{
auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth));
// Set draft tokens
auto draftTokens = VecTokens{9, 10, 11, 12, 13}; // draft tokens
auto draftLength = draftTokens.size();
FloatType const acceptanceThreshold = 0.00001f; // Ensure the draft token can be accepted
auto externalDraftTokensConfig = ExternalDraftTokensConfig(draftTokens, std::nullopt, acceptanceThreshold);
request.setExternalDraftTokensConfig(externalDraftTokensConfig);
// Set return accepted token logits for this request
OutputConfig outConfig;
outConfig.returnGenerationLogits = true;
request.setOutputConfig(outConfig);
// Enqueue this request
auto requestId = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < 5000)
{
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
auto result = response.getResult();
done = result.isFinal;
auto& genLogits = result.generationLogits;
EXPECT_TRUE(genLogits.has_value());
// Expected shape: (1, numAcceptedDraftToken, vocabSizePadded)
auto const& acceptedTokenLogitsShape = genLogits->getShape();
EXPECT_EQ(acceptedTokenLogitsShape.size(), 3);
EXPECT_EQ(acceptedTokenLogitsShape[0], 1);
EXPECT_LE(acceptedTokenLogitsShape[1], draftLength); // number of accepted tokens
EXPECT_EQ(acceptedTokenLogitsShape[2], vocabSizePadded); // vocabSizePadded
}
}
++iter;
}
}
}
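// Stop generation early via end id or stop words and check output length, finish reason, and that the generation logits argmax matches each output token.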
TEST_F(GptExecutorTest, GenerationLogitsEarlyStop)
{
SizeType32 constexpr beamWidth{1};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
auto constexpr streaming = false;
ExtendedRuntimePerfKnobConfig perfKnobConfig = ExtendedRuntimePerfKnobConfig();
// Create executor config
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setExtendedRuntimePerfKnobConfig(perfKnobConfig);
executorConfig.setGatherGenerationLogits(true);
// Create executor
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto const inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
auto const* const givenInputData = tr::bufferCast<TokenIdType const>(*givenInput);
auto const& inputShape = givenInput->getShape();
ASSERT_EQ(inputShape.nbDims, 2);
ASSERT_GT(inputShape.d[0], 0);
BeamResult beamResult{beamWidth};
auto const resultsPath
= GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();
beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
// Set return generation logits for this request
OutputConfig outConfig;
outConfig.returnGenerationLogits = true;
outConfig.excludeInputFromOutput = true;
// Load expected outputs for each beam width value
auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds);
auto const maxSeqLen = testData.maxSeqLen;
// Load expected outputs and inputs
std::vector<Request> requests;
std::vector<SizeType32> reqMaxNewTokens;
auto constexpr reqIdx = 0;
SizeType32 inputLen = givenInputLengths.at(reqIdx);
auto maxNewTokens = maxSeqLen - maxInputLength;
reqMaxNewTokens.push_back(maxNewTokens);
auto const* const seqBegin = givenInputData + reqIdx * maxInputLength;
auto request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId);
// copy request
auto request2 = request;
auto const expectedOutputData = tr::BufferRange<TokenIdType const>(*testData.expectedOutputIds);
auto const expectedOutputLengths = testData.expectedOutputLengths;
auto const endPos = expectedOutputLengths[reqIdx] - 3;
auto const endIndex = tc::flat_index3(reqIdx, beamWidth - 1, endPos, beamWidth, maxSeqLen);
auto const endToken = expectedOutputData[endIndex];
// Set end id to stop early
request.setEndId(endToken);
requests.emplace_back(std::move(request));
// Set stop words to stop early
request2.setStopWords({{endToken}});
requests.emplace_back(std::move(request2));
// Enqueue requests
auto requestIds = executor.enqueueRequests(requests);
std::map<IdType, SizeType32> expectedNewTokens;
expectedNewTokens[requestIds.at(0)] = endPos - inputLen;
expectedNewTokens[requestIds.at(1)] = endPos - inputLen + 1;
std::map<IdType, FinishReason> expectedFinishReason;
expectedFinishReason[requestIds.at(0)] = FinishReason::kEND_ID;
expectedFinishReason[requestIds.at(1)] = FinishReason::kSTOP_WORDS;
std::map<IdType, bool> done;
std::for_each(requestIds.begin(), requestIds.end(), [&done](auto id) { done[id] = false; });
int iter = 0;
while (!(std::all_of(done.begin(), done.end(), [](auto x) { return x.second; })) && iter < 5000)
{
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
auto const reqId = response.getRequestId();
auto const& result = response.getResult();
EXPECT_TRUE(result.isFinal);
done.at(reqId) = result.isFinal;
// only 1 beam
auto const& outputIds = result.outputTokenIds.at(0);
EXPECT_EQ(outputIds.size(), expectedNewTokens.at(reqId)) << "req " << reqId;
auto const& finishReason = result.finishReasons.at(0);
EXPECT_EQ(finishReason, expectedFinishReason.at(reqId)) << "req " << reqId;
auto const& genLogits = result.generationLogits;
EXPECT_TRUE(genLogits.has_value());
// Expected shape: (1, numGeneratedTokens, vocabSizePadded)
auto const& generationLogitsShape = genLogits->getShape();
EXPECT_EQ(generationLogitsShape.size(), 3);
EXPECT_EQ(generationLogitsShape[0], 1);
EXPECT_LE(generationLogitsShape[1], maxNewTokens);
EXPECT_EQ(generationLogitsShape[2], vocabSizePadded);
auto const genLogitsTensor = detail::toITensor(*genLogits);
genLogitsTensor->squeeze(0); // only 1 beam
for (size_t outputIdx = 0; outputIdx < expectedNewTokens.at(reqId); ++outputIdx)
{
// logits argmax should be equal to tokenId
auto const genLogitsSlice = tr::ITensor::slice(genLogitsTensor, outputIdx, 1);
auto const genLogitsRange = tr::BufferRange<float>(*genLogitsSlice);
auto const* maxPos = std::max_element(genLogitsRange.begin(), genLogitsRange.end());
auto const maxIdx = std::distance(genLogitsRange.begin(), maxPos);
auto const tokenId = outputIds.at(outputIdx);
// Observed a token mismatch at index 2 after building the GPT engine with TRT builder optimization
// level 3. The test case is sensitive to slight variations in kernel computation, so we skip the
// token id check at index 2.
if (outputIdx != 2)
{
EXPECT_EQ(tokenId, maxIdx) << "req " << reqId << " outputIdx " << outputIdx;
}
}
}
}
++iter;
}
}
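// Enqueue a request with a customized end id first, then enqueue the same prompt again and check the output against the ground truth (finish reason kLENGTH).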
TEST_F(GptExecutorTest, GenerationChangeEndId)
{
SizeType32 constexpr beamWidth{2};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
auto constexpr streaming = false;
ExtendedRuntimePerfKnobConfig perfKnobConfig = ExtendedRuntimePerfKnobConfig();
perfKnobConfig.setEnableContextFMHAFP32Acc(true); // use fmha fp32 acc for better accuracy
// Create executor config
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setExtendedRuntimePerfKnobConfig(perfKnobConfig);
// Create executor
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto const inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
auto const* const givenInputData = tr::bufferCast<TokenIdType const>(*givenInput);
auto const& inputShape = givenInput->getShape();
ASSERT_EQ(inputShape.nbDims, 2);
ASSERT_GT(inputShape.d[0], 0);
BeamResult beamResult{beamWidth};
auto const resultsPath
= GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_CONTEXTFMHAFP32ACC_RESULT_FILE();
// Only return tokens for checking
OutputConfig outConfig;
outConfig.excludeInputFromOutput = true;
// Load expected outputs for each beam width value
auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds);
auto const maxSeqLen = testData.maxSeqLen;
// Load expected outputs and inputs
std::vector<Request> requests;
std::vector<SizeType32> reqMaxNewTokens;
// Only use the first request to test
auto constexpr reqIdx = 0;
SizeType32 inputLen = givenInputLengths.at(reqIdx);
auto maxNewTokens = maxSeqLen - maxInputLength;
reqMaxNewTokens.push_back(maxNewTokens);
auto const* const seqBegin = givenInputData + reqIdx * maxInputLength;
// Use customized `EndId` to enqueue once
auto request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId);
TokenIdType customizedEndId = *(seqBegin + 1); // Use a token that appears in the ground truth
request.setEndId(customizedEndId);
requests.emplace_back(std::move(request));
auto requestIds = executor.enqueueRequests(requests);
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(waitTime);
if (responses.at(0).hasError())
{
FAIL();
}
requests.clear();
// Change back to default `EndId` to enqueue again, and check the output
request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId);
auto const expectedOutputData = tr::BufferRange<TokenIdType const>(*testData.expectedOutputIds);
auto const expectedOutputLengths = testData.expectedOutputLengths;
auto const endPos = expectedOutputLengths[reqIdx];
auto const endIndex = tc::flat_index3(reqIdx, beamWidth, endPos, beamWidth, maxSeqLen);
auto const endToken = expectedOutputData[endIndex];
request.setEndId(endToken);
requests.emplace_back(std::move(request));
requestIds = executor.enqueueRequests(requests);
auto const requestId = requestIds.at(0);
std::map<IdType, SizeType32> expectedNewTokens;
expectedNewTokens[requestId] = endPos - inputLen;
std::map<IdType, FinishReason> expectedFinishReason;
expectedFinishReason[requestId] = FinishReason::kLENGTH;
std::map<IdType, bool> done;
std::for_each(requestIds.begin(), requestIds.end(), [&done](auto id) { done[id] = false; });
int iter = 0;
while (!(std::all_of(done.begin(), done.end(), [](auto x) { return x.second; })) && iter < 5000)
{
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(waitTime);
auto& response = responses.at(0);
if (response.hasError())
{
FAIL();
}
else
{
auto const reqId = response.getRequestId();
auto const& result = response.getResult();
EXPECT_TRUE(result.isFinal);
done.at(reqId) = result.isFinal;
bool anyMismatch = false;
for (int i = 0; i < result.outputTokenIds.size(); ++i)
{
auto const& outputIds = result.outputTokenIds.at(i);
EXPECT_EQ(outputIds.size(), expectedNewTokens.at(reqId)) << "req " << reqId;
anyMismatch |= outputIds.size() != expectedNewTokens.at(reqId);
auto const& finishReason = result.finishReasons.at(i);
EXPECT_EQ(finishReason, expectedFinishReason.at(reqId)) << "req " << reqId;
anyMismatch |= finishReason != expectedFinishReason.at(reqId);
if (anyMismatch)
{
break;
}
for (int j = 0; j < outputIds.size(); ++j)
{
auto const resultToken = outputIds[j];
auto const groundTruthToken = expectedOutputData[maxSeqLen * i + inputLen + j];
EXPECT_EQ(resultToken, groundTruthToken);
anyMismatch |= resultToken != groundTruthToken;
}
}
EXPECT_FALSE(anyMismatch);
}
++iter;
}
}
// streaming, excludeInputFromOutput, beamWidth
using ParamType = std::tuple<bool, bool, int>;
// useOrchestratorMode, beamWidth, modelName
using ParamCancelReqType = std::tuple<bool, int, std::string>;
// modelName
using LeaderApiUsageType = std::tuple<std::string>;
// iterStatsMaxIterations, useOrchestratorMode
using ParamStatsType = std::tuple<int, bool>;
// streaming, beamWidth, computeLogProbs, excludeInputInOutput, returnContextLogits, returnGenerationLogits, modelName,
// useOrchestratorMode, returnAllGeneratedTokens, numReturnSequences
using AllParamsType = std::tuple<bool, int, bool, bool, bool, bool, std::string, bool, bool, int>;
// modelName, batched, replicated
using LogitsProcParamsType = std::tuple<std::string, bool, bool>;
// modelName
using GuidedDecodingParamsType = std::tuple<std::string>;
// modelName, useOrchestratorMode, beamWidth
using TimeoutTestParamsType = std::tuple<std::string, bool, int>;
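// Helpers that build readable gtest test names from the parameter tuples above.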
std::string generateTestName(testing::TestParamInfo<ParamType> const& info)
{
auto const streaming = std::get<0>(info.param);
auto const excludeInputFromOutput = std::get<1>(info.param);
auto const beamWidth = std::get<2>(info.param);
std::string name = "ExecutorTest";
if (streaming)
{
name += "Streaming";
}
if (excludeInputFromOutput)
{
name += "ExclInput";
}
name.append("BW" + std::to_string(beamWidth));
return name;
}
std::string generateTestNameCancelReq(testing::TestParamInfo<ParamCancelReqType> const& info)
{
auto const& useOrchestratorMode = std::get<0>(info.param);
auto const beamWidth = std::get<1>(info.param);
auto const modelName = std::get<2>(info.param);
std::string name = "ExecutorTest";
name.append("BW" + std::to_string(beamWidth));
name.append("_" + modelName + "_");
if (useOrchestratorMode)
{
name.append("OrchMode");
}
else
{
name.append("LeaderMode");
}
return name;
}
std::string generateTestNameLeaderApiUsage(testing::TestParamInfo<LeaderApiUsageType> const& info)
{
auto const modelName = std::get<0>(info.param);
std::string name = "ExecutorTest";
name.append("_" + modelName);
return name;
}
std::string generateTestNameLogitsProc(testing::TestParamInfo<LogitsProcParamsType> const& info)
{
auto const modelName = std::get<0>(info.param);
bool const batched = std::get<1>(info.param);
bool const replicated = std::get<2>(info.param);
std::string name = "ExecutorTest";
name.append("_" + modelName);
if (batched)
{
name.append("_Batched");
}
if (replicated)
{
name.append("_Replicated");
}
return name;
}
std::string generateTestNameGuidedDecoding(testing::TestParamInfo<GuidedDecodingParamsType> const& info)
{
auto const modelName = std::get<0>(info.param);
std::string name = "ExecutorTest";
name.append("_" + modelName);
return name;
}
std::string generateTestNameTimeoutTest(testing::TestParamInfo<TimeoutTestParamsType> const& info)
{
auto const modelName = std::get<0>(info.param);
auto const& useOrchestratorMode = std::get<1>(info.param);
auto const beamWidth = std::get<2>(info.param);
std::string name = "ExecutorTest";
name.append("_" + modelName);
if (useOrchestratorMode)
{
name.append("_OrchMode");
}
else
{
name.append("_LeaderMode");
}
name.append("_BW" + std::to_string(beamWidth));
return name;
}
std::string generateTestNameStats(testing::TestParamInfo<ParamStatsType> const& info)
{
int iterStatsMaxIterations = std::get<0>(info.param);
auto const& useOrchestratorMode = std::get<1>(info.param);
std::string name = "ExecutorTest_";
name.append(std::to_string(iterStatsMaxIterations) + "_");
if (useOrchestratorMode)
{
name.append("OrchMode");
}
else
{
name.append("LeaderMode");
}
return name;
}
std::string generateTestNameAllParams(testing::TestParamInfo<AllParamsType> const& info)
{
auto const streaming = std::get<0>(info.param);
auto const& beamWidth = std::get<1>(info.param);
auto const& computeLogProbs = std::get<2>(info.param);
auto const& excludeInputInOutput = std::get<3>(info.param);
auto const& returnContextLogits = std::get<4>(info.param);
auto const& returnGenerationLogits = std::get<5>(info.param);
auto const modelName = std::get<6>(info.param);
auto const& useOrchestratorMode = std::get<7>(info.param);
auto const& returnAllGeneratedTokens = std::get<8>(info.param);
auto const& numReturnSequences = std::get<9>(info.param);
std::string name = "ExecutorTest_";
if (streaming)
{
name += "Streaming";
}
name.append("_BW" + std::to_string(beamWidth));
name.append("Nseq" + std::to_string(numReturnSequences));
if (computeLogProbs)
{
name.append("LogProbs");
}
if (excludeInputInOutput)
{
name.append("ExcludeInput");
}
if (returnContextLogits)
{
name.append("ContextLogits");
}
if (returnGenerationLogits)
{
name.append("GenerationLogits");
}
name.append("_" + modelName + "_");
if (useOrchestratorMode)
{
name.append("OrchMode");
}
else
{
name.append("LeaderMode");
}
if (returnAllGeneratedTokens)
{
name.append("returnAllGeneratedTokens");
}
return name;
}
class ParamTest : public GptExecutorTest, public ::testing::WithParamInterface<ParamType>
{
};
class ParamStatsTest : public GptExecutorTest, public ::testing::WithParamInterface<ParamStatsType>
{
};
class AllParamsTest : public GptExecutorTest, public ::testing::WithParamInterface<AllParamsType>
{
};
class ParamCancelReqTest : public GptExecutorTest, public ::testing::WithParamInterface<ParamCancelReqType>
{
};
class LeaderApiUsageTest : public GptExecutorTest, public ::testing::WithParamInterface<LeaderApiUsageType>
{
};
class LogitsProcParamsTest : public GptExecutorTest, public ::testing::WithParamInterface<LogitsProcParamsType>
{
};
class GuidedDecodingParamsTest : public GptExecutorTest, public ::testing::WithParamInterface<GuidedDecodingParamsType>
{
};
class TimeoutTest : public GptExecutorTest, public ::testing::WithParamInterface<TimeoutTestParamsType>
{
};
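// Run a single request and verify the per-iteration statistics (active requests, memory usage, KV cache and inflight batching stats) and their JSON serialization.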
TEST_F(GptExecutorTest, GetLatestStats)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(std::move(request));
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
done = response.getResult().isFinal;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
// Expect 6 non-empty iterations
auto stats = executor.getLatestIterationStats();
EXPECT_EQ(stats.size(), 6);
uint64_t currentIter = 0;
for (auto const& stat : stats)
{
EXPECT_EQ(stat.timestamp.size(), 26);
EXPECT_EQ(stat.iter, currentIter);
if (currentIter != 5)
{
EXPECT_EQ(stat.numActiveRequests, 1);
}
else
{
// For the last iteration the number of active requests
// should be zero.
EXPECT_EQ(stat.numActiveRequests, 0);
}
EXPECT_EQ(stat.maxNumActiveRequests, 64);
// Very loose check to make sure the memory stats are valid
EXPECT_GT(stat.gpuMemUsage, 16);
EXPECT_GT(stat.cpuMemUsage, 16);
EXPECT_GT(stat.pinnedMemUsage, 16);
// Stats for KV cache
EXPECT_TRUE(stat.kvCacheStats.has_value());
KvCacheStats const& kvStats = stat.kvCacheStats.value();
EXPECT_GT(kvStats.maxNumBlocks, 0);
EXPECT_GT(kvStats.freeNumBlocks, 0);
EXPECT_EQ(kvStats.usedNumBlocks, currentIter == maxNewTokens ? 0 : 1);
EXPECT_GT(kvStats.tokensPerBlock, 0);
EXPECT_GT(kvStats.allocTotalBlocks, 0);
EXPECT_GT(kvStats.allocNewBlocks, 0);
EXPECT_GE(kvStats.reusedBlocks, 0);
EXPECT_GE(kvStats.missedBlocks, 0);
EXPECT_GE(kvStats.cacheHitRate, 0);
// Stats for inflight batching
EXPECT_TRUE(stat.inflightBatchingStats.has_value() && !stat.staticBatchingStats.has_value());
InflightBatchingStats const& modelStats = stat.inflightBatchingStats.value();
EXPECT_EQ(modelStats.numScheduledRequests, currentIter == maxNewTokens ? 0 : 1);
EXPECT_EQ(modelStats.numContextRequests, currentIter == 0 ? 1 : 0);
EXPECT_EQ(modelStats.numGenRequests, currentIter == 0 || currentIter == maxNewTokens ? 0 : 1);
EXPECT_EQ(modelStats.numPausedRequests, 0);
EXPECT_EQ(modelStats.numCtxTokens, currentIter == 0 ? inputTokens.size() : 0);
EXPECT_EQ(modelStats.microBatchId, 0);
EXPECT_NEAR(
modelStats.avgNumDecodedTokensPerIter, currentIter == 0 || currentIter == maxNewTokens ? 0.f : 1.f, 1e-9f);
auto jsonStr = JsonSerialization::toJsonStr(stat);
EXPECT_THAT(jsonStr, testing::HasSubstr("\"iter\":" + std::to_string(currentIter)));
EXPECT_THAT(jsonStr, testing::HasSubstr("\"staticBatchingStats\":null"));
EXPECT_THAT(jsonStr, testing::HasSubstr("\"numCtxTokens\":" + std::to_string(modelStats.numCtxTokens)));
EXPECT_THAT(jsonStr, testing::HasSubstr("\"numGenRequests\":" + std::to_string(modelStats.numGenRequests)));
++currentIter;
}
}
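// Two staggered requests: verify iteration counts, numCompletedRequests, numNewActiveRequests and queue latency depending on whether the requests overlapped.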
TEST_F(GptExecutorTest, GetLatestStatsWithMultipleRequests)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the requests
SizeType32 const numRequests = 2;
std::vector<SizeType32> maxNewTokens{3, 5};
std::vector<VecTokens> inputTokens{{1, 2, 3, 4}, {5, 6, 7}};
std::vector<IdType> reqIds;
for (SizeType32 ireq = 0; ireq < numRequests; ++ireq)
{
auto request = Request(inputTokens[ireq], maxNewTokens[ireq], streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(std::move(request));
reqIds.emplace_back(requestId);
// sleep for 20 ms before sending the next request
std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
for (SizeType32 ireq = 0; ireq < numRequests; ++ireq)
{
auto requestId = reqIds[ireq];
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
done = response.getResult().isFinal;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
}
// NOTES:
// Expect at least max(maxNewTokens), i.e. 5, non-empty iterations.
// The 4th iteration should have numCompletedRequests equal to 1.
// Depending on the timing, the first iteration will have either:
// - 2 active requests, or
// - 1 active request and 1 queued request.
auto stats = executor.getLatestIterationStats();
EXPECT_GT(stats.size(), 0); // make sure we have at least 1 stat before accessing the 0-th element
if (stats[0].numActiveRequests == 2)
{
// we cannot reliably check queue latency since both started in the same iteration
// there should be exactly 5 non-empty iterations
EXPECT_EQ(stats.size(), 5);
// only check numCompletedRequests in 4th iteration
EXPECT_EQ(stats[3].numCompletedRequests, 1);
// The 1st iteration should record the queueing time of both requests;
EXPECT_EQ(stats[0].numNewActiveRequests, 2);
// the remaining iterations should not report any queueing time.
for (int i = 1; i < stats.size(); ++i)
{
EXPECT_EQ(stats[i].numNewActiveRequests, 0);
}
}
else
{
// there should be more than 5 non-empty iterations since the 2nd request started after the 1st iteration
EXPECT_GT(stats.size(), 5);
// the 1st request's completion is at the 4th iteration
EXPECT_EQ(stats[3].numCompletedRequests, 1);
// the 1st iteration records one request's queueing time
EXPECT_EQ(stats[0].numNewActiveRequests, 1);
// in the iteration where the 2nd request became active, the queue latency must be > 0
uint64_t currentIter = 0;
for (auto const& stat : stats)
{
// To check when the 2nd request becomes active, we need to consider 2 cases:
// - it overlaps with the first request
// => only check the queue time in this case
// - it doesn't overlap with the first request (e.g. the 1st request ended too fast)
// => little to no queue time, cannot check reliably
// so we only check the queue time when numActiveRequests > 1, i.e. the overlap happened after the first iteration
if (stat.numActiveRequests > 1)
{
EXPECT_GT(currentIter, 0); // it must be after 1st iteration
EXPECT_GT(stat.newActiveRequestsQueueLatencyMS, 0);
// the 2nd request records its queueing time in this iteration
EXPECT_EQ(stat.numNewActiveRequests, 1);
break;
}
++currentIter;
}
}
}
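// Verify per-request statistics across iterations: scheduling, chunked context progress, generation progress and completion.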
TEST_F(GptExecutorTest, GetLatestRequestStats)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setRequestStatsMaxIterations(1000);
executorConfig.setEnableChunkedContext(true);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the requests
std::vector<std::pair<SizeType32, VecTokens>> requestParams = {
// {maxNewTokens, inputTokens}
{5, {1, 2, 3, 4}}, {4, {1, 1, 2, 3, 5}}, {1, {1}},
{8, VecTokens(383, 1)} // Long enough to be chunked into multiple iterations
};
std::vector<Request> requests;
for (auto requestParam : requestParams)
{
requests.emplace_back(requestParam.second, requestParam.first, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
}
auto requestIdsVec = executor.enqueueRequests(std::move(requests));
std::map<IdType, SizeType32> requestIdToIndex;
std::set<IdType> activeRequests;
for (SizeType32 i = 0; i < requestIdsVec.size(); ++i)
{
auto requestId = requestIdsVec[i];
activeRequests.insert(requestId);
requestIdToIndex[requestId] = i;
}
int iter = 0;
while (!activeRequests.empty() && iter < mMaxWaitMs)
{
for (auto i = activeRequests.begin(); i != activeRequests.end();)
{
auto requestId = *i;
bool thisDone = false;
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
else
{
thisDone = response.getResult().isFinal;
}
}
if (thisDone)
{
// Erase completed request and move to the next one
i = activeRequests.erase(i);
}
else
{
++i;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
// Expect 5 non-empty iterations
// Note: The 6th iteration with the last finished request will be reported
// but might be unavailable when getLatestRequestStats is called since
// it could be updated after the final response has been sent.
auto stats = executor.getLatestRequestStats();
EXPECT_GE(stats.size(), 5);
SizeType32 currentIter = 0;
auto invalidStart = std::numeric_limits<SizeType32>::max();
std::vector<SizeType32> genStart(requestParams.size(), invalidStart); // The iteration index when generation started
std::set<IdType> completedRequests;
for (auto stat = stats.begin(); stat != stats.begin() + 5; ++stat)
{
auto jsonStrIter = JsonSerialization::toJsonStr(*stat);
EXPECT_EQ(stat->iter, currentIter);
EXPECT_THAT(jsonStrIter, testing::HasSubstr("\"iter\":" + std::to_string(currentIter)));
EXPECT_EQ(stat->requestStats.size() + completedRequests.size(), requestParams.size());
for (auto rStat : stat->requestStats)
{
auto jsonStr = JsonSerialization::toJsonStr(rStat);
// Only a few requests here so all of them should be scheduled. A separate test
// GetLatestRequestStatsScheduling will target the scheduling stats.
if (rStat.stage != RequestStage::kGENERATION_COMPLETE)
{
EXPECT_TRUE(rStat.scheduled);
EXPECT_THAT(jsonStr, testing::HasSubstr("\"scheduled\":true"));
}
EXPECT_TRUE(!rStat.paused);
EXPECT_THAT(jsonStr, testing::HasSubstr("\"paused\":false"));
EXPECT_TRUE(requestIdToIndex.count(rStat.id));
EXPECT_THAT(jsonStr, testing::HasSubstr("\"id\":" + std::to_string(rStat.id)));
auto requestIndex = requestIdToIndex[rStat.id];
auto contextSize = requestParams[requestIndex].second.size();
if (rStat.contextPrefillPosition == contextSize) // Check generation phase
{
bool firstIteration{false};
// Context phase is done
EXPECT_TRUE(rStat.stage == RequestStage::kGENERATION_IN_PROGRESS
|| rStat.stage == RequestStage::kGENERATION_COMPLETE);
EXPECT_THAT(jsonStr, testing::HasSubstr("\"stage\":\"GENERATION"));
if (genStart[requestIndex] == invalidStart)
{
// Just started generation
genStart[requestIndex] = currentIter;
firstIteration = true;
}
// One token per iteration
EXPECT_TRUE(currentIter - genStart[requestIndex] == rStat.numGeneratedTokens);
EXPECT_NEAR(rStat.avgNumDecodedTokensPerIter, firstIteration ? 0.f : 1.0f, 1e-9);
if (rStat.stage == RequestStage::kGENERATION_COMPLETE)
{
EXPECT_TRUE(requestParams[requestIndex].first >= rStat.numGeneratedTokens);
completedRequests.insert(requestIndex);
}
else
{
EXPECT_FALSE(completedRequests.count(requestIndex));
}
}
else if (rStat.contextPrefillPosition < contextSize) // Check context phase
{
// Must be chunked
SizeType32 const maxChunkSize = 128;
EXPECT_TRUE(rStat.contextPrefillPosition % maxChunkSize == 0);
// Context phase is on-going
EXPECT_TRUE(rStat.stage == RequestStage::kCONTEXT_IN_PROGRESS);
// No tokens are generated
EXPECT_TRUE(0 == rStat.numGeneratedTokens);
}
else
{
FAIL() << "Out-of-boundary contextPrefillPosition in stats: " << rStat.contextPrefillPosition
<< " out of " << contextSize;
}
// Sanity check that disaggregated serving stats is not set in typical use case
EXPECT_FALSE(rStat.disServingStats.has_value());
}
++currentIter;
}
// We should have visited all requests,
// taking into account that the last completed request may not have been reported yet.
EXPECT_EQ(completedRequests.size() + 1, requestParams.size());
}
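// With more requests than the max batch size, request stats must consistently report scheduled, queued and just-completed requests.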
TEST_F(GptExecutorTest, GetLatestRequestStatsScheduling)
{
// Specifically test the case where there are too many requests to be scheduled in a single iteration
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setRequestStatsMaxIterations(1000);
executorConfig.setEnableChunkedContext(true);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create 100 requests. Note the max batch size for this model is 64 so some requests won't be scheduled right away.
std::vector<std::pair<SizeType32, VecTokens>> requestParams(100, {5, {1, 2, 3, 4}});
std::vector<Request> requests;
requests.reserve(requestParams.size());
for (auto requestParam : requestParams)
{
requests.emplace_back(requestParam.second, requestParam.first, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
}
auto requestIdsVec = executor.enqueueRequests(std::move(requests));
std::map<IdType, SizeType32> requestIdToIndex;
std::set<IdType> activeRequests;
for (SizeType32 i = 0; i < requestIdsVec.size(); ++i)
{
auto requestId = requestIdsVec[i];
activeRequests.insert(requestId);
requestIdToIndex[requestId] = i;
}
int iter = 0;
while (!activeRequests.empty() && iter < mMaxWaitMs)
{
for (auto i = activeRequests.begin(); i != activeRequests.end();)
{
auto requestId = *i;
bool thisDone = false;
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
else
{
thisDone = response.getResult().isFinal;
}
}
if (thisDone)
{
// Erase completed request and move to the next one
i = activeRequests.erase(i);
}
else
{
++i;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
auto stats = executor.getLatestRequestStats();
SizeType32 numFinished = 0;
SizeType32 const maxActiveSize = 64; // Decided by the model
// The 6th iteration's request stats may or may not be available when getLatestRequestStats
// is called. When there are no other active or inTransmission requests, there will be
// another request stats entry that properly resets all the statistics to zero.
for (auto stat = stats.begin(); stat != stats.begin() + 5; ++stat)
{
SizeType32 numReqs = 0;
SizeType32 numReqsActive = 0;
SizeType32 numReqsQueued = 0;
SizeType32 numReqsJustDone = 0;
for (auto rStat : stat->requestStats)
{
++numReqs;
numReqsActive += rStat.scheduled ? 1 : 0;
numReqsQueued += rStat.stage == RequestStage::kQUEUED ? 1 : 0;
numReqsJustDone += rStat.stage == RequestStage::kGENERATION_COMPLETE ? 1 : 0;
}
EXPECT_EQ(numReqs, numReqsActive + numReqsQueued + numReqsJustDone);
EXPECT_EQ(numReqs + numFinished, requestParams.size()); // Should report all unfinished requests
EXPECT_TRUE(numReqsActive <= maxActiveSize); // Not all requests are active due to max active size limit.
numFinished += numReqsJustDone;
}
}
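// Each completed request must appear exactly once with stage kGENERATION_COMPLETE in the request stats.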
TEST_F(GptExecutorTest, GetRequestStatsMultipleRequests)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setRequestStatsMaxIterations(1000);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto sendRequestWaitForResponseFn = [&]()
{
Request request({1, 2, 3}, 5);
auto requestId = executor.enqueueRequest(request);
bool isFinalResponse = false;
while (!isFinalResponse)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto response : responses)
{
if (response.getResult().isFinal)
{
isFinalResponse = true;
break;
}
}
}
return requestId;
};
std::unordered_map<IdType, size_t> requestIdToGenerationComplete;
auto updateStats = [&]()
{
auto stats = executor.getLatestRequestStats();
for (auto& stat : stats)
{
for (auto const& request : stat.requestStats)
{
// only check and aggregate results when the request is completed
if (request.stage == RequestStage::kGENERATION_COMPLETE)
{
requestIdToGenerationComplete[request.id] += 1;
}
}
}
};
auto requestId = sendRequestWaitForResponseFn();
requestIdToGenerationComplete[requestId] = 0;
updateStats();
requestId = sendRequestWaitForResponseFn();
requestIdToGenerationComplete[requestId] = 0;
updateStats();
for (auto [key, value] : requestIdToGenerationComplete)
{
EXPECT_EQ(value, 1);
}
}
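// With dynamic batch size tuning enabled, the tuner-recommended max batch size should shrink as the input length grows.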
TEST_F(GptExecutorTest, BatchSizeTuning)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setRequestStatsMaxIterations(1000);
executorConfig.setEnableChunkedContext(true);
DynamicBatchConfig dynamicBatchConfig(true, false, 1); // Set window size to 1
SchedulerConfig schedulerConfig(CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::nullopt, dynamicBatchConfig);
executorConfig.setSchedulerConfig(schedulerConfig);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
std::vector<SizeType32> tunerRecommendedBatchSizes;
for (size_t i = 0; i <= 8; ++i)
{
auto inputLength = 1 << i; // Note that for this model max input len is 383
Request request(
VecTokens(inputLength, 2), 5, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(std::move(request));
// Wait for current request to finish
while (true)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
if (responses.size() != 0)
{
EXPECT_TRUE(responses.size() == 1);
auto response = responses[0];
EXPECT_FALSE(response.hasError());
if (response.getResult().isFinal)
{
break;
}
}
}
auto reqStats = executor.getLatestIterationStats();
EXPECT_TRUE(reqStats.size() > 0);
auto lastStat = reqStats.back();
tunerRecommendedBatchSizes.push_back(lastStat.maxBatchSizeTunerRecommended);
}
EXPECT_TRUE(tunerRecommendedBatchSizes.size() > 0);
// The recommended batch size is supposed to decrease as the input length increases
EXPECT_TRUE(*tunerRecommendedBatchSizes.begin() > *tunerRecommendedBatchSizes.rbegin());
}
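// With a DebugConfig selecting the sequence_length tensor, verify the debug tensors reported for each iteration.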
TEST_F(GptExecutorTest, GetLatestDebugTensors)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 maxNewTokens = 5;
tensorrt_llm::executor::DebugConfig debugConfig;
debugConfig.setDebugTensorNames({{"sequence_length"}});
debugConfig.setDebugTensorsMaxIterations(maxNewTokens);
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setDebugConfig(debugConfig);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
done = response.getResult().isFinal;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
auto stream = std::make_shared<tr::CudaStream>();
// Expect 5 non-empty iterations
auto debugTensors = executor.getLatestDebugTensors();
EXPECT_EQ(debugTensors.size(), 5);
uint64_t currentIter = 0;
for (auto const& debugIteration : debugTensors)
{
EXPECT_EQ(debugIteration.iter, currentIter);
EXPECT_EQ(debugIteration.debugTensors.size(), 2);
{
auto it = debugIteration.debugTensors.find("request_ids");
EXPECT_NE(it, debugIteration.debugTensors.end());
auto const& tensor = it->second;
auto const& shape = tensor.getShape();
EXPECT_EQ(shape.size(), 1);
EXPECT_EQ(shape[0], 1);
EXPECT_EQ(tensor.getSize(), 1);
auto const* dataPtr = static_cast<SizeType32 const*>(tensor.getData());
EXPECT_EQ(dataPtr[0], 1) << "currentIter " << currentIter;
}
{
auto it = debugIteration.debugTensors.find("sequence_length");
EXPECT_NE(it, debugIteration.debugTensors.end());
auto const& tensor = it->second;
auto const& shape = tensor.getShape();
EXPECT_EQ(shape.size(), 1);
EXPECT_EQ(tensor.getSize(), 1);
auto tensorHost = tensor.copyToCpu(stream);
auto const* dataPtr = static_cast<SizeType32 const*>(tensorHost.getData());
EXPECT_EQ(dataPtr[0], inputTokens.size() + currentIter);
}
++currentIter;
}
}
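// Single request round trip: check response and token counts for streaming/non-streaming, with and without the input echoed in the output.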
TEST_P(ParamTest, SingleRequestDemo)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
// Enqueue the request
auto requestId = executor.enqueueRequest(request);
// Get the new tokens
VecTokens tokens;
SizeType32 numResponses{0};
bool done = false;
int iter = 0;
std::chrono::milliseconds waitTime(1);
while (!done && iter < mMaxWaitMs)
{
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
++numResponses;
if (response.hasError())
{
// This request failed for some reason, get error msg
std::string errStr
= "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg();
FAIL() << errStr;
}
auto result = response.getResult();
done = result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto const expectedSize = streaming ? (beamWidth > 1 ? numResponses : 1)
: (maxNewTokens + (excludeInputFromOutput ? 0 : inputTokens.size()));
EXPECT_EQ(newTokens.size(), expectedSize);
if (streaming && beamWidth > 1)
{
// replace tokens
tokens = newTokens;
}
else
{
// Append tokens
tokens.insert(tokens.end(), newTokens.begin(), newTokens.end());
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_EQ(numResponses, streaming ? maxNewTokens : 1);
EXPECT_EQ(
tokens.size(), streaming ? maxNewTokens : (excludeInputFromOutput ? 0 : inputTokens.size()) + maxNewTokens);
// Expect awaitResponses to return an error message because the request has already terminated (isFinal = true)
auto response = executor.awaitResponses(requestId, waitTime).at(0);
EXPECT_TRUE(response.hasError());
std::string err
= "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
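// Enqueue many requests with random prompt lengths and maxNewTokens and check token counts, response counts and finish reasons.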
TEST_P(ParamTest, MultipleRequestDemo)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 numRequests = 20;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 maxPromptLen = 20;
SizeType32 maxMaxNewTokens = 20;
SizeType32 endId = -1;
// Enqueue the requests
std::unordered_map<IdType, VecTokens> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
std::unordered_map<IdType, SizeType32> expectedNumResponses;
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId);
auto reqId = executor.enqueueRequest(std::move(request));
tokens[reqId] = {};
expectedNumTokens[reqId] = ((streaming || excludeInputFromOutput) ? 0 : promptLen) + maxNewTokens;
expectedNumResponses[reqId] = streaming ? maxNewTokens : 1;
}
// Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
std::unordered_map<IdType, SizeType32> numResponses;
while (numFinished < numRequests && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
auto reqId = response.getRequestId();
++numResponses[reqId];
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto const expectedSize
= streaming ? (beamWidth > 1 ? numResponses[reqId] : 1) : expectedNumTokens[reqId];
EXPECT_EQ(newTokens.size(), expectedSize);
auto& reqTokens = tokens.at(response.getRequestId());
if (streaming && beamWidth > 1)
{
reqTokens = newTokens;
}
else
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
for (SizeType32 b = 0; b < beamWidth; ++b)
{
EXPECT_EQ(result.finishReasons.at(b),
result.isFinal ? FinishReason::kLENGTH : FinishReason::kNOT_FINISHED);
}
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
// Check that number of tokens matches expectations
for (auto const& [reqId, numTokens] : expectedNumTokens)
{
EXPECT_EQ(expectedNumResponses[reqId], numResponses[reqId]) << "reqId " << reqId;
EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId;
}
}
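// Enqueue many requests while a separate thread polls getLatestIterationStats; parameterized over iterStatsMaxIterations and leader/orchestrator mode.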
TEST_P(ParamStatsTest, MultipleRequestStats)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 numRequests = 100;
auto iterStatsMaxIterations = std::get<0>(GetParam());
bool useOrchestratorMode = std::get<1>(GetParam());
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setIterStatsMaxIterations(iterStatsMaxIterations);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
std::optional<OrchestratorConfig> orchestratorConfig = std::nullopt;
if (useOrchestratorMode)
{
orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
}
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt, std::nullopt,
orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 maxPromptLen = 20;
SizeType32 maxMaxNewTokens = 20;
SizeType32 endId = -1;
// Enqueue the requests
std::unordered_map<IdType, VecTokens> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId);
auto reqId = executor.enqueueRequest(std::move(request));
tokens[reqId] = {};
expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 0 : promptLen)) + maxNewTokens;
}
std::atomic<bool> statsThreadDone = false;
std::atomic<int32_t> numFinished = 0;
std::deque<IterationStats> iterStatsReceived;
// Spawn a thread that continuously gets stats
auto statsThread = std::thread(
[&executor, &numFinished, numRequests, &iterStatsReceived, &statsThreadDone]()
{
while (numFinished < numRequests)
{
auto reqStats = executor.getLatestIterationStats();
iterStatsReceived.insert(iterStatsReceived.end(), std::make_move_iterator(reqStats.begin()),
std::make_move_iterator(reqStats.end()));
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
statsThreadDone = true;
});
// Get the new tokens for each request
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < numRequests && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto& reqTokens = tokens.at(response.getRequestId());
reqTokens.insert(reqTokens.end(), std::make_move_iterator(newTokens.begin()),
std::make_move_iterator(newTokens.end()));
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
// Check that number of tokens matches expectations
for (auto const& [reqId, numTokens] : expectedNumTokens)
{
EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId;
}
// Wait for stats thread to be done, fail otherwise
iter = 0;
while (!statsThreadDone && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
std::this_thread::sleep_for(std::chrono::milliseconds(waitTime));
iter++;
}
ASSERT_TRUE(statsThreadDone);
if (iterStatsMaxIterations > 0)
{
ASSERT_GT(iterStatsReceived.size(), 1);
for (auto stats : iterStatsReceived)
{
EXPECT_GT(stats.numActiveRequests, 0);
TLLM_LOG_INFO("%d %d", stats.iter, stats.numActiveRequests);
EXPECT_TRUE(stats.inflightBatchingStats.has_value());
if (stats.inflightBatchingStats.has_value())
{
EXPECT_GT(stats.inflightBatchingStats.value().numScheduledRequests, 0);
}
}
}
statsThread.join();
}
TEST_P(ParamTest, MultipleRequestBatchResponses)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 constexpr numRequests{20};
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr maxPromptLen{20};
SizeType32 constexpr maxMaxNewTokens{20};
SizeType32 endId = -1;
// Enqueue the requests
std::unordered_map<IdType, VecTokens> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
std::vector<IdType> requestIds;
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId);
auto reqId = executor.enqueueRequest(std::move(request));
requestIds.push_back(reqId);
tokens[reqId] = {};
expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 0 : promptLen)) + maxNewTokens;
}
    // Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
std::chrono::milliseconds waitTime(1);
while (numFinished < numRequests && iter < mMaxWaitMs)
{
auto idResponses = executor.awaitResponses(requestIds, waitTime);
for (unsigned i = 0; i < requestIds.size(); ++i)
{
auto& responses = idResponses[i];
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto& reqTokens = tokens.at(response.getRequestId());
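                    // With streaming beam search, each response carries all tokens generated so far,
                    // so overwrite the stored tokens instead of appending.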
if (streaming && beamWidth > 1)
{
reqTokens = newTokens;
}
else
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
    // Call awaitResponses again; we expect to see only terminated-request-id errors.
auto idResponses = executor.awaitResponses(requestIds, waitTime);
for (auto const& responses : idResponses)
{
for (auto& response : responses)
{
EXPECT_TRUE(response.hasError());
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
// Check that number of tokens matches expectations
for (auto const& [reqId, numTokens] : expectedNumTokens)
{
EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId;
}
}
TEST_P(ParamTest, GetNumResponsesReadyTest)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 maxNumRequests = 50;
SizeType32 maxPromptLen = 20;
SizeType32 maxMaxNewTokens = 20;
SizeType32 numRequests = rand() % maxNumRequests + 1;
SizeType32 numExpectedResponses = 0;
std::map<IdType, SizeType32> reqNumExpectedResponses;
std::vector<IdType> ids;
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto id = executor.enqueueRequest(std::move(request));
ids.emplace_back(id);
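        // Streaming yields one response per generated token; non-streaming yields a single final response.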
reqNumExpectedResponses[id] = streaming ? maxNewTokens : 1;
numExpectedResponses += reqNumExpectedResponses.at(id);
}
SizeType32 iter = 0;
SizeType32 numReady = 0;
while (numReady < numExpectedResponses && iter < mMaxWaitMs)
{
numReady = 0;
for (auto id : ids)
{
numReady += executor.getNumResponsesReady(id);
}
std::this_thread::sleep_for(std::chrono::milliseconds(1));
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
    // Expect the precomputed number of responses for each request
for (auto id : ids)
{
        SizeType32 const numReadyForReq = executor.getNumResponsesReady(id);
        EXPECT_EQ(numReadyForReq, reqNumExpectedResponses.at(id));
}
auto numResponsesReady = executor.getNumResponsesReady();
EXPECT_EQ(numResponsesReady, numExpectedResponses);
}
namespace
{
void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& modelIds,
FlakyTestInfo const& flakyTestInfo, bool streaming, SizeType32 const vocabSizePadded, BeamResult const& beamResult,
OutputConfig const& outConfig, bool isSpeculativeDecoding, int maxWaitMs, bool returnAllGeneratedTokens,
SizeType32 const numReturnSequences, bool isNonGreedySampling, SizeType32 const modelParallelism)
{
auto const beamWidth = beamResult.beamWidth;
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
auto const* const givenInputData = tr::bufferCast<TokenIdType const>(*givenInput);
auto const& inputShape = givenInput->getShape();
ASSERT_EQ(inputShape.nbDims, 2);
ASSERT_GT(inputShape.d[0], 0);
// Load expected outputs for each beam width value
auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds);
auto const maxSeqLen = testData.maxSeqLen;
// Load expected outputs and inputs
SizeType32 numRequests = static_cast<SizeType32>(givenInputLengths.size());
SizeType32 maxRequests = numRequests;
std::vector<Request> requests;
std::vector<SizeType32> reqMaxNewTokens;
auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
    // Top-k is set to a large value to test that the N returned sequences are not identical.
if (isNonGreedySampling)
{
samplingConfig.setTopK(32);
}
samplingConfig.setNumReturnSequences(numReturnSequences);
for (SizeType32 req = 0; req < maxRequests; ++req)
{
SizeType32 inputLen = givenInputLengths.at(req);
auto maxNewTokens = maxSeqLen - maxInputLength;
reqMaxNewTokens.push_back(maxNewTokens);
SizeType32 endId = -1;
auto const* const seqBegin = givenInputData + req * maxInputLength;
        VecTokens tokens(seqBegin, seqBegin + inputLen);
        auto request = Request(std::move(tokens), maxNewTokens, streaming, samplingConfig, outConfig, endId);
request.setReturnAllGeneratedTokens(returnAllGeneratedTokens);
requests.emplace_back(std::move(request));
}
auto& comm = tensorrt_llm::mpi::MpiComm::world();
auto const worldRank = comm.getRank();
// Expected return sizes.
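    // With beam search (beamWidth > 1) a request yields a single sequence whose result carries
    // min(beamWidth, numReturnSequences) beams; with sampling it yields numReturnSequences single-beam sequences.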
auto const numSequences = beamWidth > 1 ? 1 : numReturnSequences;
auto const numReturnBeams = std::min(beamWidth, numReturnSequences);
if (worldRank == 0)
{
auto const reqIds = executor.enqueueRequests(requests);
std::unordered_map<SizeType32, std::vector<BeamTokens>> tokens;
std::unordered_map<IdType, SizeType32> reqIdToBatchId;
for (SizeType32 req = 0; req < reqIds.size(); ++req)
{
std::vector<BeamTokens> resultTokens(numSequences, BeamTokens(numReturnBeams));
tokens[req] = std::move(resultTokens);
reqIdToBatchId[reqIds.at(req)] = req;
}
        // Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
std::unordered_map<IdType, SizeType32> numResponses;
while (numFinished < maxRequests && iter < maxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
auto batchId = reqIdToBatchId.at(response.getRequestId());
numResponses[batchId]++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto seqIdx = result.sequenceIndex;
auto const& contextLogits = result.contextLogits;
auto const& genLogits = result.generationLogits;
auto const& outputTokenIds = result.outputTokenIds;
EXPECT_EQ(result.finishReasons.size(), numReturnBeams);
for (SizeType32 beam = 0; beam < numReturnBeams; ++beam)
{
auto const& newTokens = outputTokenIds.at(beam);
auto& reqTokens = tokens.at(batchId).at(seqIdx).at(beam);
if (!returnAllGeneratedTokens)
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
else
{
EXPECT_EQ(newTokens.size(),
(numResponses.at(batchId) + numReturnSequences - 1) / numReturnSequences);
reqTokens = newTokens;
}
// FinishReason is only supported for bw=1 and inflight batching.
if (beamWidth == 1)
{
EXPECT_EQ(result.finishReasons.at(beam),
result.isSequenceFinal ? FinishReason::kLENGTH : FinishReason::kNOT_FINISHED);
}
}
auto const& cumLogProbs = result.cumLogProbs;
auto const& logProbs = result.logProbs;
auto const& beamTokens = tokens.at(batchId).at(seqIdx);
EXPECT_EQ(beamTokens.size(), numReturnBeams);
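                    // Reference log-prob and logits comparisons are only valid for deterministic decoding,
                    // so skip them when sampling non-greedily.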
if (!isNonGreedySampling)
{
float const logitsAtol = modelParallelism > 1 ? 1e-1 : 1e-2;
float const logitsRtol = modelParallelism > 1 ? 1e-2 : 1e-3;
testData.verifyLogProbs(outConfig.returnLogProbs, streaming, outConfig.excludeInputFromOutput,
givenInputLengths.at(batchId), beamWidth, beamTokens, cumLogProbs, logProbs, batchId,
flakyTestInfo);
testData.validateContextLogits(outConfig.returnContextLogits, givenInputLengths.at(batchId),
beamWidth, contextLogits, vocabSizePadded, batchId, logitsAtol, logitsRtol);
testData.validateGenerationLogits(outConfig.returnGenerationLogits, result.isSequenceFinal,
streaming, outConfig.excludeInputFromOutput, givenInputLengths.at(batchId),
reqMaxNewTokens.at(batchId), beamWidth, beamTokens, genLogits, vocabSizePadded, batchId,
returnAllGeneratedTokens, logitsAtol, logitsRtol);
}
// Ignore first iteration as it doesn't use draft tokens
if (outConfig.returnPerfMetrics && isSpeculativeDecoding
&& result.requestPerfMetrics.value().iter > 0)
{
auto& specDecMetrics = result.requestPerfMetrics.value().speculativeDecoding;
// 4 draft tokens are used per step
EXPECT_EQ(specDecMetrics.totalDraftTokens, result.requestPerfMetrics.value().iter.value() * 4);
EXPECT_EQ(specDecMetrics.acceptanceRate,
static_cast<float>(specDecMetrics.totalAcceptedDraftTokens)
/ specDecMetrics.totalDraftTokens);
}
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, maxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numSequences, isNonGreedySampling);
}
}
void runTest(fs::path const& modelPath, ExecutorConfig const& executorConfig, fs::path const& inputPath,
ModelIds const& modelIds, FlakyTestInfo const& flakyTestInfo, bool streaming, SizeType32 const vocabSizePadded,
BeamResult const& beamResult, OutputConfig const& outConfig, bool isSpeculativeDecoding, int maxWaitMs,
bool returnAllGeneratedTokens, SizeType32 const numReturnSequences, bool isNonGreedySampling,
SizeType32 const modelParallelism)
{
auto executor = Executor{modelPath, ModelType::kDECODER_ONLY, executorConfig};
runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig,
isSpeculativeDecoding, maxWaitMs, returnAllGeneratedTokens, numReturnSequences, isNonGreedySampling,
modelParallelism);
}
ExecutorConfig createExecutorConfig(SizeType32 maxBeamWidth, bool useOrchestratorMode, bool gatherGenerationLogits,
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt,
std::optional<std::vector<SizeType32>> participantIds = std::nullopt)
{
// Note: we reduce memory fraction for cases that return context/generation logits which require more free
// memory
FloatType constexpr freeGpuMemoryFraction{0.5F};
KvCacheConfig kvCacheConfig(false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
auto executorConfig = ExecutorConfig(maxBeamWidth);
executorConfig.setKvCacheConfig(kvCacheConfig);
executorConfig.setNormalizeLogProbs(false);
executorConfig.setGatherGenerationLogits(gatherGenerationLogits);
std::optional<OrchestratorConfig> orchestratorConfig = std::nullopt;
if (useOrchestratorMode)
{
orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
}
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::move(deviceIds),
std::move(participantIds), orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
return executorConfig;
}
} // namespace
TEST_P(AllParamsTest, TokenComparison)
{
auto const streaming = std::get<0>(GetParam());
auto const& beamWidth = std::get<1>(GetParam());
OutputConfig outConfig;
outConfig.returnLogProbs = std::get<2>(GetParam());
outConfig.excludeInputFromOutput = std::get<3>(GetParam());
outConfig.returnContextLogits = std::get<4>(GetParam());
outConfig.returnGenerationLogits = std::get<5>(GetParam());
auto const modelName = std::get<6>(GetParam());
auto const useOrchestratorMode = std::get<7>(GetParam());
auto const returnAllGeneratedTokens = std::get<8>(GetParam());
auto const numReturnSequences = std::get<9>(GetParam());
if (returnAllGeneratedTokens && !streaming)
{
GTEST_SKIP() << "Test does not support returnAllGeneratedTokens without streaming";
}
std::optional<std::vector<SizeType32>> participantIds = std::nullopt;
BeamResult beamResult{beamWidth};
ASSERT_TRUE(fs::exists(DATA_PATH));
fs::path modelPath;
// set defaults and adjust if needed by different models
fs::path inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
bool isSpeculativeDecoding{false};
SizeType32 vocabSizePadded = 50257;
// NOTE: This can be used to disable checks for certain prompt batch entries
FlakyTestInfo flakyTestInfo;
if (modelName == "gpt")
{
auto const resultsPath
= GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
if (outConfig.returnContextLogits || outConfig.returnGenerationLogits)
{
modelPath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu";
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();
beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
if (outConfig.returnLogProbs)
{
beamResult.cumLogProbsFile
= resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_CUM_LOG_PROBS_FILE();
beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_LOG_PROBS_FILE();
}
}
else
{
modelPath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_FILE();
if (outConfig.returnLogProbs)
{
beamResult.cumLogProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CUM_LOG_PROBS_FILE();
beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_LOG_PROBS_FILE();
}
}
}
else if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1"
|| modelName == "llama_tp1_pp2_cp1")
{
inputPath = DATA_PATH / LLAMA_INPUT_FILE;
modelIds.padId = LLAMA_PAD_ID;
modelIds.endId = LLAMA_END_ID;
vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED;
auto const resultsPath
= LLAMA_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
if (modelName == "llama_tp4_pp1_cp1")
{
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP4_PP1_FILE();
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP4_FILE();
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
}
else if (modelName == "llama_tp1_pp2_cp1")
{
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP2_FILE();
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp2-cp1-gpu";
}
else if (modelName == "llama_tp2_pp2_cp1")
{
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP2_PP2_FILE();
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
}
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_TP4_PP1_FILE();
if (outConfig.returnLogProbs)
{
beamResult.cumLogProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CUM_LOG_PROBS_TP4_PP1_FILE();
beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_LOG_PROBS_TP4_PP1_FILE();
}
}
else if (modelName == "medusa")
{
TLLM_CHECK_WITH_INFO(beamWidth == 1, "Medusa does not support beam search.");
auto const resultsPath = MEDUSA_DATA_PATH / "sampling";
auto modelSpec = ModelSpec::getDefaultModelSpec()
.useMedusa()
.setInputFile("input_tokens_long.npy")
.setMaxOutputLength(128);
beamResult.resultsFile = resultsPath / modelSpec.getResultsFile();
modelPath = MEDUSA_MODEL_PATH / modelSpec.getModelPath() / "tp1-pp1-cp1-gpu";
inputPath = DATA_PATH / "input_vicuna.npy";
modelIds.padId = 2;
modelIds.endId = 2;
isSpeculativeDecoding = true;
outConfig.returnPerfMetrics = true;
}
else if (modelName == "chatglm" || modelName == "chatglm2" || modelName == "chatglm3" || modelName == "glm")
{
fs::path resultsPath;
if (modelName == "chatglm")
{
resultsPath = CHATGLM_DATA_PATH;
modelPath = CHATGLM_MODEL_PATH;
}
else if (modelName == "chatglm2")
{
resultsPath = CHATGLM2_DATA_PATH;
modelPath = CHATGLM2_MODEL_PATH;
}
else if (modelName == "chatglm3")
{
resultsPath = CHATGLM3_DATA_PATH;
modelPath = CHATGLM3_MODEL_PATH;
}
else if (modelName == "glm")
{
resultsPath = GLM_DATA_PATH;
modelPath = GLM_MODEL_PATH;
}
resultsPath /= (beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth);
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_FILE();
modelPath = modelPath / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
char versionChatglm{0};
if (size_t index = modelPath.string().find("chatglm"); index != std::string::npos)
{
versionChatglm = modelPath.string()[index + 7];
std::string const vChatglmString
= (versionChatglm == '-') ? std::string("") : std::string(1, versionChatglm);
inputPath = DATA_PATH / ("input_tokens_chatglm" + vChatglmString + "-6b.npy");
modelIds.padId = (versionChatglm == '-') ? 3 : 0;
modelIds.endId = (versionChatglm == '-') ? 130005 : 2;
}
else if (size_t index = modelPath.string().find("glm-10b"); index != std::string::npos)
{
inputPath = DATA_PATH / "input_tokens_glm-10b.npy";
modelIds.padId = 50256;
modelIds.endId = 50258;
}
if (versionChatglm != 0)
{
flakyTestInfo.batchIdBeams.insert(std::make_pair(1, 0));
}
}
else
{
TLLM_THROW("Unrecognized modelName");
}
if (streaming && beamWidth > 1)
{
GTEST_SKIP() << "Test does not support streaming with beam search";
}
// Warning: This should be the last check before running the test.
// It will initialize MPI which can take significant time.
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1"
|| modelName == "llama_tp1_pp2_cp1")
{
        // For the Llama models, only run when multiple GPUs are available.
        // This is detected via an env variable set when running the test.
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == nullptr)
{
GTEST_SKIP() << "Skipping Llama test";
}
if (outConfig.returnContextLogits)
{
GTEST_SKIP() << "Skipping context logits tests for mpi runs";
}
        // Check that it was launched with the right number of MPI ranks
if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
{
            // Orchestrator mode, the test process world size must be 1
            FAIL() << "Orchestrator mode and world size is not equal to 1";
}
}
auto decoderJsonConfig = tensorrt_llm::runtime::GptJsonConfig::parse(modelPath / "config.json");
auto const modelTP = decoderJsonConfig.getTensorParallelism();
auto const modelPP = decoderJsonConfig.getPipelineParallelism();
auto const modelParallelism = modelTP * modelPP;
int deviceCount = -1;
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
std::optional<std::vector<SizeType32>> deviceIds = std::vector<SizeType32>(modelParallelism);
for (auto i = 0; i < deviceIds->size(); i++)
{
deviceIds->at(i) = i % deviceCount;
}
if (modelName == "llama_tp1_pp2_cp1")
{
auto const& session = tensorrt_llm::mpi::MpiComm::world();
if (session.getSize() != 4)
{
FAIL() << "Llama-tp1-pp2 is intended solely for testing coexisting engines within the same MPI world,"
" which requires a session size of 4. However, the current session size is "
<< session.getSize() << " .";
}
if (session.getRank() / 2 == 0)
{
participantIds = std::vector<SizeType32>{0, 1};
deviceIds = std::vector<SizeType32>{0, 1};
}
else
{
participantIds = std::vector<SizeType32>{2, 3};
deviceIds = std::vector<SizeType32>{2, 3};
}
}
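    // When pipeline parallelism is used, scramble the default device order: reverse the PP groups, then restore
    // ascending order within each TP group (e.g. {2, 3, 0, 1} for tp2-pp2), presumably to exercise a non-default
    // deviceIds mapping, matching the explicit deviceIds used in other multi-GPU tests here.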
if (modelPP > 1)
{
std::reverse(deviceIds->begin(), deviceIds->end());
if (modelTP > 1)
{
for (SizeType32 ppRank = 0; ppRank < modelPP; ppRank++)
{
                std::reverse(deviceIds->begin() + ppRank * modelTP, deviceIds->begin() + (ppRank + 1) * modelTP);
}
}
}
    // Returning logits increases latency
if (streaming && (outConfig.returnContextLogits || outConfig.returnGenerationLogits))
{
mMaxWaitMs = 20000;
}
auto executorConfig = createExecutorConfig(beamWidth, useOrchestratorMode, outConfig.returnGenerationLogits,
std::move(deviceIds), std::move(participantIds));
runTest(modelPath, executorConfig, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult,
outConfig, isSpeculativeDecoding, mMaxWaitMs, returnAllGeneratedTokens, numReturnSequences, false,
modelParallelism);
}
TEST_F(GptExecutorTest, ChangeBeamWidth)
{
SizeType32 constexpr maxBeamWidth{2};
auto executorConfig = ExecutorConfig(maxBeamWidth);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr beamWidth1{1};
SizeType32 constexpr beamWidth2{2};
SizeType32 constexpr maxNewTokens{2};
VecTokens inputTokens{1, 2, 3, 4};
// Create requests with different beam widths
std::vector<Request> requests;
requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1));
requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1));
requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth2));
requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1));
auto requestIds = executor.enqueueRequests(requests);
int numFinished = 0;
int iter = 0;
while (numFinished < 4 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
std::cout << "err:" << err << std::endl;
FAIL() << "Should not get a response with error";
}
else
{
auto result = response.getResult();
numFinished += static_cast<int>(result.isFinal);
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
auto stats = executor.getLatestIterationStats();
uint64_t currentIter = 0;
for (auto const& stat : stats)
{
// TODO: enable this check when stats are cleaned
// EXPECT_EQ(stat.iter, currentIter);
if (stat.iter < 2)
{
            // Requests 1 and 2 run with the same beam width
EXPECT_EQ(stat.numActiveRequests, 2);
}
else if (stat.numActiveRequests != 0) // TODO: remove this check when stats are cleaned
{
            // Requests 3 and 4 run with a different beam width
EXPECT_EQ(stat.numActiveRequests, 1);
}
++currentIter;
}
}
void doTokenComparisonChangeBeamWidth(bool enableReuse, SizeType32 maxWaitMs)
{
SizeType32 constexpr maxBeamWidth{2};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
auto constexpr streaming = false;
// Create executor config
auto kvCacheConfig = KvCacheConfig(enableReuse);
auto executorConfig = ExecutorConfig(maxBeamWidth, SchedulerConfig(), kvCacheConfig);
// Create executor
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto const inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
OutputConfig outConfig;
FlakyTestInfo flakyTestInfo;
bool constexpr isSpeculativeDecoding{false};
for (SizeType32 beamWidth : {1, 2})
{
BeamResult beamResult{beamWidth};
auto const resultsPath
= GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();
beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig,
isSpeculativeDecoding, maxWaitMs, false, 1, false, 1);
}
}
TEST_F(GptExecutorTest, TokenComparisonChangeBeamWidth)
{
doTokenComparisonChangeBeamWidth(false, mMaxWaitMs);
}
TEST_F(GptExecutorTest, TokenComparisonChangeBeamWidthBlockReuse)
{
doTokenComparisonChangeBeamWidth(true, mMaxWaitMs);
}
TEST_F(GptExecutorTest, NReturnRandomness)
{
SizeType32 constexpr maxBeamWidth{1};
SizeType32 constexpr numReturnSequences{2};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
auto constexpr streaming = false;
// Create executor config
auto executorConfig = ExecutorConfig(maxBeamWidth);
// Create executor
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto const inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
OutputConfig outConfig;
FlakyTestInfo flakyTestInfo;
bool constexpr isSpeculativeDecoding{false};
BeamResult beamResult{maxBeamWidth};
auto const resultsPath = GPT_DATA_PATH / "sampling";
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();
beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig,
        isSpeculativeDecoding, mMaxWaitMs, false, numReturnSequences, true, 1);
}
TEST_F(GptExecutorTest, TimedOut)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// No requests enqueued, expect no responses
auto numResponsesReady = executor.getNumResponsesReady();
EXPECT_EQ(numResponsesReady, 0);
std::chrono::milliseconds waitTime(10);
auto responses = executor.awaitResponses(waitTime);
EXPECT_EQ(responses.size(), 0);
}
TEST_F(GptExecutorTest, MaxSeqIdleMicrosecondsError)
{
auto executorConfig = ExecutorConfig(1);
// Request will time out
executorConfig.setMaxSeqIdleMicroseconds(1);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr maxNewTokens{5};
VecTokens inputTokens{1, 2, 3, 4};
std::vector<Request> requests;
requests.emplace_back(inputTokens, maxNewTokens, false);
auto requestIds = executor.enqueueRequests(requests);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
std::cout << "err:" << err << std::endl;
EXPECT_THAT(err, testing::HasSubstr("Unable to get batch slot for reqId"));
done = true;
}
else
{
FAIL() << "Should get a response with error";
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
}
void logitsProcessorMixedReqsTest(std::string const& modelDir, SizeType32 worldRank, SizeType32 maxWaitMs,
bool replicated, std::optional<std::vector<SizeType32>> deviceIds);
TEST_P(LogitsProcParamsTest, All)
{
auto const modelName = std::get<0>(GetParam());
auto const batched = std::get<1>(GetParam());
auto const replicated = std::get<2>(GetParam());
std::string modelDir;
int tp_size = 1, pp_size = 1, cp_size = 1;
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
if (modelName == "llama_tp1_pp1_cp1")
{
modelDir = "tp1-pp1-cp1-gpu";
}
else if (modelName == "llama_tp4_pp1_cp1")
{
modelDir = "tp4-pp1-cp1-gpu";
tp_size = 4;
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelDir = "tp1-pp4-cp1-gpu";
pp_size = 4;
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelDir = "tp2-pp2-cp1-gpu";
tp_size = pp_size = 2;
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
else
{
TLLM_THROW("Unrecognized modelName");
}
std::filesystem::path modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / modelDir;
auto& comm = tensorrt_llm::mpi::MpiComm::world();
auto const worldRank = comm.getRank();
auto const worldSize = comm.getSize();
if (tp_size * pp_size * cp_size != 1)
{
// Run multi GPU test only when env variable is set
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
        if (val == nullptr)
{
GTEST_SKIP() << "Skipping multi-gpu logits post processor test";
}
if (worldSize != 4)
{
FAIL() << "Leader mode and world size is not equal to 4";
}
}
else
{
// This has no effect for single-GPU tests
if (replicated)
{
GTEST_SKIP() << "Skipping single-gpu replicated logits post processor test";
}
}
// Configuration options
bool const streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 numRequests = 20;
IdType const kClientId = 1234;
SizeType32 beamWidth = 1;
SizeType32 maxPromptLen = 20;
SizeType32 maxMaxNewTokens = 20;
SizeType32 constexpr endId{2};
SizeType32 constexpr vocabSizePadded{32000}; // llama-7b vocabSizePadded
    // tokenIdCalculator generates a token_id from the request id and the output position.
    // The LogitsPostProcessor then sets all logits except the generated token_id to a large negative value,
    // so the output token must be the token produced by tokenIdCalculator.
auto tokenIdCalculator = [endId, vocabSizePadded](IdType req, SizeType32 pos)
{
SizeType32 tokenId = (req * 1000 + pos) % vocabSizePadded;
if (tokenId == endId)
{
tokenId = 0;
}
return tokenId;
};
std::unordered_map<IdType, VecTokens> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
std::unordered_map<IdType, VecTokens> expectedOutputTokens;
// Enqueue the requests
auto enqueueRequests = [&](Executor& executor, std::optional<std::string const> logitsProcessorName,
std::optional<LogitsPostProcessor> logitsProcessor = std::nullopt)
{
tokens.clear();
expectedNumTokens.clear();
expectedOutputTokens.clear();
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId);
request.setClientId(kClientId);
if (logitsProcessorName)
{
request.setLogitsPostProcessorName(logitsProcessorName.value());
}
else if (logitsProcessor)
{
request.setLogitsPostProcessor(logitsProcessor.value());
}
auto reqId = executor.enqueueRequest(std::move(request));
tokens[reqId] = {};
expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 0 : promptLen)) + maxNewTokens;
expectedOutputTokens[reqId] = {};
if (!streaming && !excludeInputFromOutput)
{
expectedOutputTokens[reqId].resize(promptLen, 1);
}
for (SizeType32 outputPos = 0; outputPos < maxNewTokens; ++outputPos)
{
SizeType32 outputTokenId = tokenIdCalculator(reqId, outputPos + promptLen);
expectedOutputTokens[reqId].push_back(outputTokenId);
}
}
};
    // Get the new tokens for each request
auto collectResponses = [&](Executor& executor)
{
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < numRequests && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
EXPECT_EQ(response.getClientId().value(), kClientId);
auto result = response.getResult();
numFinished += result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto& reqTokens = tokens.at(response.getRequestId());
reqTokens.insert(reqTokens.end(), std::make_move_iterator(newTokens.begin()),
std::make_move_iterator(newTokens.end()));
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
};
    // Check that the tokens match expectations
auto checkOutput = [&]()
{
for (auto const& [reqId, numTokens] : expectedNumTokens)
{
EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId;
for (SizeType32 tokenPos = 0;
tokenPos < std::min<SizeType32>(expectedNumTokens[reqId], tokens[reqId].size()); ++tokenPos)
{
EXPECT_EQ(expectedOutputTokens[reqId][tokenPos], tokens[reqId][tokenPos])
<< "reqId=" << reqId << ", tokenPos=" << tokenPos;
}
}
};
// Test non-batched logits processor
std::string const logitsProcessorName = "SelectToken";
auto logitsPostProcessorFn = [&](IdType reqId, Tensor& logits, BeamTokens const& tokens, StreamPtr const& streamPtr,
std::optional<IdType> clientId)
{
if (replicated)
{
EXPECT_TRUE(worldRank <= tp_size - 1);
}
else
{
EXPECT_TRUE(worldRank == 0);
}
EXPECT_TRUE(clientId.value() == kClientId);
SizeType32 numTokens = tokens.at(0).size();
SizeType32 pos = numTokens;
SizeType32 outputTokenId = tokenIdCalculator(reqId, pos);
auto logitsDataType = logits.getDataType();
EXPECT_TRUE(logitsDataType == DataType::kFP16 || logitsDataType == DataType::kBF16
|| logitsDataType == DataType::kFP32);
// logits has shape [draftLength + 1, reqBeamWidth, vocabSize]
auto logitsCpu = tensorrt_llm::executor::Tensor::cpu(logitsDataType, logits.getShape());
auto* dataPtr = logitsCpu.getData();
auto eltSize = logitsCpu.getSizeInBytes() / logitsCpu.getSize();
EXPECT_TRUE(eltSize == 2 || eltSize == 4);
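        // For 16-bit logits (FP16/BF16) fill with the most negative finite bit pattern; for FP32 fill with -inf.
        // Only the selected token keeps a logit of zero, so it is the only token that can be sampled.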
if (eltSize == 2)
{
auto* dataPtrU16 = static_cast<uint16_t*>(dataPtr);
uint16_t hugeNegValue = logitsDataType == DataType::kFP16 ? 0xFBFF : 0xFF7F; // a huge negative value
for (size_t i = 0; i < logitsCpu.getSize(); ++i)
{
dataPtrU16[i] = hugeNegValue;
}
dataPtrU16[outputTokenId] = 0;
}
else
{
auto* dataPtrFloat = static_cast<float*>(dataPtr);
for (size_t i = 0; i < logitsCpu.getSize(); ++i)
{
dataPtrFloat[i] = -HUGE_VALF;
}
dataPtrFloat[outputTokenId] = 0.0f;
}
logits.setFrom(logitsCpu, streamPtr);
};
if (!batched)
{
auto executorConfig = ExecutorConfig(beamWidth);
LogitsPostProcessorConfig logitsProcConfig{
std::unordered_map<std::string, tensorrt_llm::executor::LogitsPostProcessor>{
{logitsProcessorName, logitsPostProcessorFn}},
std::nullopt, replicated};
executorConfig.setLogitsPostProcessorConfig(logitsProcConfig);
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
if (worldRank == 0)
{
enqueueRequests(executor, logitsProcessorName);
collectResponses(executor);
checkOutput();
if (!replicated || tp_size == 1)
{
// Dynamic logits postprocessor must be used with replicate=false or no tensor parallelism.
enqueueRequests(executor, std::nullopt, logitsPostProcessorFn);
collectResponses(executor);
checkOutput();
}
}
}
// Test batched logits processor
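    // The batched variant simply forwards each entry of the batch to the per-request callback above.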
auto logitsPostProcessorBatchedFn
= [logitsPostProcessorFn](std::vector<IdType> const& reqIdBatch, std::vector<Tensor>& logitsBatch,
std::vector<std::reference_wrapper<BeamTokens const>> const& tokensBatch, StreamPtr const& streamPtr,
std::vector<std::optional<IdType>> const& clientIdBatch)
{
for (int sample = 0; sample < reqIdBatch.size(); sample++)
{
logitsPostProcessorFn(
reqIdBatch[sample], logitsBatch[sample], tokensBatch[sample], streamPtr, clientIdBatch[sample]);
}
};
if (batched)
{
auto batchedExecutorConfig = ExecutorConfig(beamWidth);
if (deviceIds.has_value())
{
auto parallelConfig = batchedExecutorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
batchedExecutorConfig.setParallelConfig(parallelConfig);
}
LogitsPostProcessorConfig logitsProcConfig{std::nullopt, logitsPostProcessorBatchedFn, replicated};
batchedExecutorConfig.setLogitsPostProcessorConfig(logitsProcConfig);
auto batchedExecutor = Executor(modelPath, ModelType::kDECODER_ONLY, batchedExecutorConfig);
if (worldRank == 0)
{
enqueueRequests(batchedExecutor, Request::kBatchedPostProcessorName);
collectResponses(batchedExecutor);
checkOutput();
}
}
if (!batched)
{
logitsProcessorMixedReqsTest(modelDir, worldRank, mMaxWaitMs, replicated, std::move(deviceIds));
}
}
// Test for mixing requests with and without logits processor.
void logitsProcessorMixedReqsTest(std::string const& modelDir, SizeType32 worldRank, SizeType32 maxWaitMs,
bool replicated, std::optional<std::vector<SizeType32>> deviceIds)
{
std::string const logitsProcessorName = "dummy";
auto logitsPostProcessorFn = [&](IdType reqId, Tensor& logits, BeamTokens const& tokens, StreamPtr const& streamPtr,
std::optional<IdType> clientId)
{
// Dummy callback that does not modify logits
assert(!clientId.has_value());
};
LogitsPostProcessorConfig logitsProcConfig{
std::unordered_map<std::string, tensorrt_llm::executor::LogitsPostProcessor>{
{logitsProcessorName, logitsPostProcessorFn}},
std::nullopt, replicated};
// Create executor
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setLogitsPostProcessorConfig(logitsProcConfig);
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
std::filesystem::path modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / modelDir;
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
if (worldRank == 0)
{
SizeType32 numRequests = 2;
SizeType32 promptLen = 5;
// First request with no LP and many output tokens
auto request1 = Request(VecTokens(promptLen, 1), 25);
// Second request with LP and few output tokens
auto request2 = Request(VecTokens(promptLen, 1), 5);
request2.setLogitsPostProcessorName(logitsProcessorName);
// Enqueue requests
auto reqId1 = executor.enqueueRequest(request1);
auto reqId2 = executor.enqueueRequest(request2);
// Wait for responses
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < numRequests && iter < maxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, maxWaitMs);
}
}
TEST_F(GptExecutorTest, LogitsPostProcessorThrow)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
std::string const logitsProcessorName = "UnExistProcessor";
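    // Referencing a post-processor name that was never registered should make enqueueRequest throw.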
auto request
= Request(VecTokens(10, 1), 10, false, tensorrt_llm::executor::SamplingConfig(beamWidth), OutputConfig());
request.setLogitsPostProcessorName(logitsProcessorName);
EXPECT_THROW({ auto reqId = executor.enqueueRequest(std::move(request)); }, tensorrt_llm::common::TllmException);
}
static Response executeDraftRequest(Executor& executor)
{
OutputConfig outputConfig;
outputConfig.returnGenerationLogits = true;
// Create the request
SizeType32 maxNewTokens = 4;
VecTokens inputTokens{1, 2, 3, 4};
Request request{std::move(inputTokens), maxNewTokens};
request.setOutputConfig(outputConfig);
// Enqueue the request
auto requestId = executor.enqueueRequest(std::move(request));
// Wait for the response
auto responses = executor.awaitResponses(requestId);
return responses.at(0);
}
static Response executeTargetRequest(Executor& executor, Result const& draftResult)
{
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
Request request{std::move(inputTokens), maxNewTokens};
VecTokens const& outputTokenIds = draftResult.outputTokenIds.at(0);
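    // The draft request generated 4 new tokens; use them as the draft tokens for the target request.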
VecTokens draftTokens(outputTokenIds.end() - 4, outputTokenIds.end());
auto const& logitsInfo = draftResult.specDecFastLogitsInfo.value();
auto logitsTensor = logitsInfo.toTensor();
ExternalDraftTokensConfig draftTokensConfig(
std::move(draftTokens), logitsTensor, std::nullopt /* acceptance threshold */, true /* fastLogits */);
request.setExternalDraftTokensConfig(draftTokensConfig);
// Enqueue the request
auto requestId = executor.enqueueRequest(std::move(request));
// Wait for the response
auto responses = executor.awaitResponses(requestId);
return responses.at(0);
}
class SpeculativeDecodingTest : public GptExecutorTest
{
};
TEST_F(SpeculativeDecodingTest, SpecDecFastLogits)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtDraftEnginePath
= GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu";
auto trtEnginePath
= GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DRAFT_TOKENS_DIR() / "tp1-pp1-cp1-gpu";
FloatType freeGpuMemoryFraction = 0.3;
auto kvCacheConfig
= KvCacheConfig(true /* enableBlockReuse */, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
executorConfig.setKvCacheConfig(kvCacheConfig);
tensorrt_llm::mpi::initialize(tensorrt_llm::mpi::MpiThreadSupport::THREAD_MULTIPLE);
int const worldSize = tensorrt_llm::mpi::MpiComm::world().getSize();
ASSERT_EQ(worldSize, 3);
int const myRank = tensorrt_llm::mpi::MpiComm::world().getRank();
bool const isOrchestrator = (myRank == 0);
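    // Rank 0 acts as the orchestrator and drives both executors; rank 1 hosts the draft engine, rank 2 the target.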
auto orchestratorConfig
        = OrchestratorConfig(isOrchestrator, "" /* workerExecutablePath */, nullptr, false /* spawnProcesses */);
auto parallelConfig = ParallelConfig(
CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto specDecConfig = SpeculativeDecodingConfig(true /* fastLogits */);
executorConfig.setSpecDecConfig(specDecConfig);
std::unique_ptr<Executor> draftExecutor;
std::unique_ptr<Executor> targetExecutor;
if (isOrchestrator)
{
auto executorConfigDraft = executorConfig;
parallelConfig.setParticipantIds({1});
executorConfigDraft.setParallelConfig(parallelConfig);
draftExecutor = std::make_unique<Executor>(trtDraftEnginePath, ModelType::kDECODER_ONLY, executorConfigDraft);
parallelConfig.setParticipantIds({2});
executorConfig.setParallelConfig(parallelConfig);
targetExecutor = std::make_unique<Executor>(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
else if (myRank == 1) // draft model process
{
parallelConfig.setParticipantIds({1});
parallelConfig.setDeviceIds({0});
executorConfig.setParallelConfig(parallelConfig);
executorConfig.setGatherGenerationLogits(true);
draftExecutor = std::make_unique<Executor>(trtDraftEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
else if (myRank == 2) // target model process
{
parallelConfig.setParticipantIds({2});
parallelConfig.setDeviceIds({0});
executorConfig.setParallelConfig(parallelConfig);
draftExecutor = std::make_unique<Executor>(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
if (isOrchestrator)
{
auto response = executeDraftRequest(*draftExecutor);
ASSERT_FALSE(response.hasError());
response = executeTargetRequest(*targetExecutor, response.getResult());
ASSERT_FALSE(response.hasError());
}
}
TEST_F(GptExecutorTest, OrchestratorMaxQueueSize)
{
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
SizeType32 maxQueueSize = 6;
ExecutorConfig executorConfig;
executorConfig.setMaxQueueSize(maxQueueSize);
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(
CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 100;
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens);
std::vector<IdType> requestIds;
auto numberOfRequests = maxQueueSize * 5;
requestIds.reserve(numberOfRequests);
// Enqueue more requests than the queue can manage
for (int i = 0; i < numberOfRequests; i++)
{
auto requestId = executor.enqueueRequest(request);
requestIds.emplace_back(requestId);
}
auto responseVectors = executor.awaitResponses(std::move(requestIds));
bool failedWithFullQueue = false;
for (auto& responseVector : responseVectors)
{
for (auto& response : responseVector)
{
if (response.hasError())
{
EXPECT_THAT(response.getErrorMsg(),
testing::HasSubstr("Maximum queue size of 6 has been reached, please try again later"));
failedWithFullQueue = true;
}
}
}
EXPECT_TRUE(failedWithFullQueue) << "Expected requests to fail due to maximum queue size reached";
// Wait for requests to get scheduled to free up space in queue
std::this_thread::sleep_for(std::chrono::milliseconds(maxQueueSize * 200));
auto requestId = executor.enqueueRequest(std::move(request));
auto responses = executor.awaitResponses(requestId);
for (auto& response : responses)
{
EXPECT_FALSE(response.hasError());
}
}
TEST_F(GptExecutorTest, SingleRequestInvalidInputs)
{
bool streaming = true;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
std::vector<std::string> expectedErrMsgs;
std::vector<Request> requests;
// Invalid embedding bias shape
{
requests.emplace_back(inputTokens, maxNewTokens, streaming);
auto embeddingBias = Tensor::cpu(DataType::kFP32, {1});
requests.back().setEmbeddingBias(embeddingBias);
expectedErrMsgs.emplace_back("embedding bias shape is not as expected");
}
for (auto req = 0; req < requests.size(); ++req)
{
auto& request = requests.at(req);
auto const& expectedErrMsg = expectedErrMsgs.at(req);
auto requestId = executor.enqueueRequest(std::move(request));
// Try to get the new tokens
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
EXPECT_THAT(err, testing::HasSubstr(expectedErrMsg));
done = true;
}
else
{
FAIL() << "Expected an err: " << expectedErrMsg;
}
}
++iter;
}
EXPECT_EQ(done, true);
}
}
TEST_F(GptExecutorTest, ExecutorKVCacheManager)
{
bool streaming = true;
int numRequests = 3;
SizeType32 beamWidth = 1;
SizeType32 maxNewTokens = 5;
auto executorConfig = ExecutorConfig(beamWidth);
auto kvCacheConfig = KvCacheConfig(true, 128);
kvCacheConfig.setEventBufferMaxSize(1024);
executorConfig.setKvCacheConfig(kvCacheConfig);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto kvCacheManager = *executor.getKVCacheEventManager();
// Created event should be available before any requests.
auto events = kvCacheManager->getLatestEvents(std::chrono::seconds(1));
EXPECT_EQ(events.size(), 1);
EXPECT_TRUE(std::holds_alternative<KVCacheCreatedData>(events.front().data));
// Create requests
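    // Each prompt has 63 tokens so that (assuming 32 tokens per KV cache block) it fills one block and most of a
    // second; the generated tokens then complete that block and start a partial decode block, which drives the
    // KV cache events checked below.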
std::vector<Request> requests;
    for (int request = 0; request < numRequests; request++)
{
VecTokens inputTokens;
for (int i = 0; i < 63; i++)
{
inputTokens.emplace_back(i + request);
}
requests.emplace_back(inputTokens, maxNewTokens, streaming);
}
for (auto req = 0; req < requests.size(); ++req)
{
auto& request = requests.at(req);
auto requestId = executor.enqueueRequest(std::move(request));
// Get the new tokens
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
// This request failed for some reason, get error msg
std::string errStr
= "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg();
                    FAIL() << errStr;
}
else
{
auto result = response.getResult();
done = result.isFinal;
if (done)
{
std::this_thread::sleep_for(std::chrono::milliseconds(100));
auto events = kvCacheManager->getLatestEvents(std::chrono::milliseconds(100));
if (req == 0)
{
EXPECT_EQ(events.size(), 2);
// Store the first context block
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).parentHash, std::nullopt);
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).blocks.size(), 1);
// Store the second (now completed) context block and the partial decode block.
EXPECT_EQ(std::get<KVCacheStoredData>(events.back().data).blocks.size(), 2);
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).blocks[0].blockHash,
std::get<KVCacheStoredData>(events.back().data).parentHash);
}
else
{
EXPECT_EQ(events.size(), 4);
                            // Remove block(s) to make room for the second context block: one block is removed
                            // for the second request and two for the third.
EXPECT_EQ(std::get<KVCacheRemovedData>(events.front().data).blockHashes.size(), req);
events.pop_front();
// Store the first filled context block
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).blocks.size(), 1);
events.pop_front();
// Remove a block for the decode phase
EXPECT_EQ(std::get<KVCacheRemovedData>(events.front().data).blockHashes.size(), 1);
events.pop_front();
// Store the final context block and the decode block
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).blocks.size(), 2);
}
}
}
}
iter++;
}
EXPECT_EQ(done, true);
}
}
TEST_F(GptExecutorTest, SingleRequestLora)
{
bool streaming = true;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
    // Load LoRA weights and config
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto loraWeightsTensor
= std::shared_ptr(tr::utils::loadNpy(manager, LORA_WEIGHTS_FILE.string(), tr::MemoryType::kCPU));
auto loraConfigTensor
= std::shared_ptr(tr::utils::loadNpy(manager, LORA_CONFIG_FILE.string(), tr::MemoryType::kCPU));
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig());
auto loraConfig = LoraConfig(0, detail::ofITensor(loraWeightsTensor), detail::ofITensor(loraConfigTensor));
request.setLoraConfig(loraConfig);
// Enqueue the request
auto requestId = executor.enqueueRequest(std::move(request));
// Get the new tokens
VecTokens tokens;
bool done = false;
int iter = 0;
std::chrono::milliseconds waitTime(1);
while (!done && iter < mMaxWaitMs)
{
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
// This request failed for some reason, get error msg
std::string errStr
= "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg();
                FAIL() << errStr;
}
else
{
auto result = response.getResult();
done = result.isFinal;
// Append tokens
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
tokens.insert(
tokens.end(), std::make_move_iterator(newTokens.begin()), std::make_move_iterator(newTokens.end()));
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_EQ(tokens.size(), maxNewTokens);
}
TEST_P(GuidedDecodingParamsTest, All)
{
auto const modelName = std::get<0>(GetParam());
std::filesystem::path enginePath;
std::filesystem::path tokenizerInfoPath;
int tp_size = 1, pp_size = 1, cp_size = 1;
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
if (modelName == "gpt")
{
enginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
tokenizerInfoPath = GPT_XGRAMMAR_TOKENIZER_INFO_PATH;
}
else if (modelName == "llama_tp1_pp1_cp1")
{
enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH;
}
else if (modelName == "llama_tp4_pp1_cp1")
{
enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH;
tp_size = 4;
}
else if (modelName == "llama_tp1_pp4_cp1")
{
enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH;
pp_size = 4;
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH;
tp_size = 2;
pp_size = 2;
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
else
{
TLLM_THROW("Unrecognized modelName");
}
auto& comm = tensorrt_llm::mpi::MpiComm::world();
auto const worldRank = comm.getRank();
auto const worldSize = comm.getSize();
if (tp_size * pp_size * cp_size > 1)
{
// Run multi GPU test only when env variable is set
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
        if (val == nullptr)
{
GTEST_SKIP() << "Skipping multi-gpu guided decoding test";
}
else
{
if (worldSize != 4)
{
FAIL() << "Leader mode and world size is not equal to 4";
}
}
}
bool streaming = false;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto const tokenizerInfo = nlohmann::json::parse(std::ifstream{tokenizerInfoPath});
auto const encodedVocab = tokenizerInfo["encoded_vocab"].template get<std::vector<std::string>>();
auto const tokenizerStr = tokenizerInfo["tokenizer_str"].template get<std::string>();
auto const stopTokenIds = tokenizerInfo["stop_token_ids"].template get<std::vector<TokenIdType>>();
GuidedDecodingConfig guidedDecodingConfig(
GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR, encodedVocab, tokenizerStr, stopTokenIds);
executorConfig.setGuidedDecodingConfig(guidedDecodingConfig);
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
auto executor = Executor(enginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the requests
VecTokens inputTokens;
if (modelName == "gpt")
{
inputTokens = {2061, 318, 352, 10, 16, 30, 23998, 39559, 287, 257, 8633, 287, 33918, 5794, 25, 220};
}
else // llama
{
inputTokens = {
128000, 62, 3923, 7037, 62, 16, 10, 16, 30, 62, 16533, 87710, 1265, 4404, 5356, 1265, 9643, 9132, 25, 62};
}
SizeType32 maxNewTokens = 10;
SamplingConfig samplingConfig{};
OutputConfig outputConfig{false, false, false, true};
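    // Enqueue five requests: an unguided baseline, then one each guided by JSON, a JSON schema, a regex and an EBNF grammar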
std::vector<Request> requests;
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON));
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
std::string jsonSchema{
R"({"properties": {"answer": {"title": "Answer", "type": "integer"}}, "required": ["answer"], "title": "Answer", "type": "object"})"};
requests.back().setGuidedDecodingParams(
GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON_SCHEMA, jsonSchema));
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
std::string regex{R"(\d+)"};
requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kREGEX, regex));
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
std::string ebnfGrammar{R"(root ::= [0-9]+)"};
requests.back().setGuidedDecodingParams(
GuidedDecodingParams(GuidedDecodingParams::GuideType::kEBNF_GRAMMAR, ebnfGrammar));
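    // Expected generations, in the same order as the requests above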
std::vector<VecTokens> expectedOutputTokens;
if (modelName == "gpt")
{
expectedOutputTokens.push_back({1849, 7, 16, 10, 16, 8, 198, 16, 10, 16});
expectedOutputTokens.push_back({90, 366, 3672, 1298, 366, 7554, 31780, 1600, 366, 12888});
expectedOutputTokens.push_back({90, 366, 64, 77, 2032, 68, 81, 1, 1058, 352});
expectedOutputTokens.push_back({25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645});
expectedOutputTokens.push_back({25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645});
}
else // llama
{
expectedOutputTokens.push_back({16, 10, 16, 28, 17, 198, 62, 3923, 7037, 62});
expectedOutputTokens.push_back({5018, 16, 794, 330, 16, 498, 330, 17, 794, 330});
expectedOutputTokens.push_back({5018, 9399, 794, 16, 92});
expectedOutputTokens.push_back({16});
expectedOutputTokens.push_back({16});
}
if (executor.canEnqueueRequests())
{
// Enqueue the requests
auto reqIds = executor.enqueueRequests(std::move(requests));
// Get the responses
int numFinished = 0;
int iter = 0;
while (numFinished < 5 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
auto reqId = response.getRequestId();
if (response.hasError())
{
// This request failed for some reason, get error msg
                    std::string errStr
                        = "Request id " + std::to_string(reqId) + " failed with err " + response.getErrorMsg();
                    FAIL() << errStr;
}
else
{
auto result = response.getResult();
auto& newTokens = result.outputTokenIds.at(0);
int reqIdx = std::find(reqIds.begin(), reqIds.end(), reqId) - reqIds.begin();
EXPECT_THAT(newTokens, ::testing::ElementsAreArray(expectedOutputTokens[reqIdx]));
}
                numFinished++;
            }
            ++iter;
        }
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_EQ(numFinished, 5);
}
}
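// A request that asks for guided decoding while the executor was built without a GuidedDecodingConfig
// should come back with an error response.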
TEST_F(GptExecutorTest, GuidedDecodingFailure)
{
bool streaming = false;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
std::vector<int> stopTokenIds{50256};
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the requests
SizeType32 maxNewTokens = 10;
SamplingConfig samplingConfig{};
OutputConfig outputConfig{false, false, false, true};
VecTokens inputTokens{2061, 318, 352, 10, 16, 30, 23998, 39559, 287, 257, 8633, 287, 33918, 5794, 25, 220};
std::vector<Request> requests;
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
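    // The second request asks for JSON-guided decoding, but the executor was built without a
    // GuidedDecodingConfig, so it is expected to come back with an error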
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON));
// Enqueue the requests
auto reqIds = executor.enqueueRequests(std::move(requests));
// Get the responses
int numFinished = 0;
int iter = 0;
while (numFinished < 2 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
auto reqId = response.getRequestId();
int reqIdx = std::find(reqIds.begin(), reqIds.end(), reqId) - reqIds.begin();
if (reqIdx == 0)
{
EXPECT_FALSE(response.hasError());
}
else
{
EXPECT_TRUE(response.hasError());
}
            numFinished++;
        }
        ++iter;
    }
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_EQ(numFinished, 2);
}
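// Enqueue a single long request, cancel it shortly after, and check that generation stops early
// with FinishReason::kCANCELLED and fewer tokens than requested.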
TEST_P(ParamTest, SingleRequestCancelRequest)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 300;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(std::move(request));
std::this_thread::sleep_for(std::chrono::milliseconds(100));
executor.cancelRequest(requestId);
// Try to get the new tokens
bool done = false;
int iter = 0;
VecTokens tokens;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL() << "Did not expect errors";
}
else
{
auto result = response.getResult();
done = result.isFinal;
// Append tokens
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
if (done)
{
for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
{
EXPECT_EQ(result.finishReasons[beamIdx], FinishReason::kCANCELLED);
}
}
if (streaming && beamWidth > 1)
{
tokens = newTokens;
}
else
{
tokens.insert(tokens.end(), newTokens.begin(), newTokens.end());
}
}
}
++iter;
}
EXPECT_EQ(done, true);
EXPECT_LT(iter, mMaxWaitMs);
auto expectedNumTokens
= streaming ? maxNewTokens : (excludeInputFromOutput ? 0 : inputTokens.size()) + maxNewTokens;
TLLM_LOG_INFO("num tokens: %d, expected %d", tokens.size(), expectedNumTokens);
EXPECT_LT(tokens.size(), expectedNumTokens);
}
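// Orchestrator mode: a prompt that exceeds the maximum input length should surface as an error response
// reported while fetching the new request.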
TEST_F(GptExecutorTest, orchModeFetchNewReqErr)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(
CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create a req with invalid parameters
SizeType32 maxNewTokens = 5;
    // Create a very long prompt, which should result in an error during request validation
VecTokens inputTokens(10000000);
auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
auto requestId = executor.enqueueRequest(request);
auto requestId2 = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
EXPECT_THAT(err, testing::HasSubstr("exceeds maximum input length"));
EXPECT_THAT(err, testing::HasSubstr("Encountered an error when fetching new request:"));
done = true;
}
else
{
FAIL() << "Should get a response with error";
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
}
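// Orchestrator mode: a request whose beam width exceeds the configured max beam width should surface
// as an error response.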
TEST_F(GptExecutorTest, orchModeForwardError)
{
SizeType32 constexpr maxBeamWidth{1};
auto executorConfig = ExecutorConfig(maxBeamWidth);
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(
CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Setting request beam width to 2 which should cause failure
SizeType32 constexpr beamWidth{2};
SizeType32 constexpr maxNewTokens{5};
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
auto requestId = executor.enqueueRequest(request);
auto requestId2 = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
std::cout << "err:" << err << std::endl;
EXPECT_THAT(
err, testing::HasSubstr("Requested beam width 2 is larger than configured max beam width 1"));
done = true;
}
else
{
FAIL() << "Should get a response with error";
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
}
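// Multi-GPU cancellation test: enqueue a mix of streaming and non-streaming requests (some with
// numReturnSequences > 1), cancel a subset of them, and verify that cancelled requests return fewer
// tokens than expected while the others complete normally.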
TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
{
auto const useOrchestratorMode = std::get<0>(GetParam());
auto const beamWidth = std::get<1>(GetParam());
auto const modelName = std::get<2>(GetParam());
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
OutputConfig outConfig;
auto executorConfig = ExecutorConfig(beamWidth);
std::filesystem::path modelPath;
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
{
if (modelName == "llama_tp4_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
}
// For llama model, only run for multiple GPUs
// This is detected by setting an env variable when running the test
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == NULL)
{
GTEST_SKIP() << "Skipping Llama test";
}
else
{
// Check that it was launched with right number of MPI ranks
if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
else if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
{
            // Orchestrator mode, the test binary itself must run with a single rank
            FAIL() << "Orchestrator mode and world size is not equal to 1";
}
}
if (useOrchestratorMode)
{
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
std::nullopt, orchestratorConfig);
if (deviceIds.has_value())
{
parallelConfig.setDeviceIds(deviceIds.value());
}
executorConfig.setParallelConfig(parallelConfig);
}
else
{
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 50;
VecTokens inputTokens{1, 2, 3, 4};
std::vector<Request> requests;
for (auto streaming : {false, true})
{
// Add two requests with numReturnSequences = 1
auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
// Add a request with numReturnSequences > 1
auto samplingConfig2 = tensorrt_llm::executor::SamplingConfig(beamWidth);
auto constexpr numReturnSequences = 2;
samplingConfig2.setNumReturnSequences(numReturnSequences);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig2, outConfig);
}
std::vector<bool> cancelRequests{true, false, true, true, false, true};
if (executor.canEnqueueRequests())
{
auto const requestIds = executor.enqueueRequests(requests);
        // Cancel the requests flagged in cancelRequests (the 1st, 3rd, 4th and 6th)
std::this_thread::sleep_for(std::chrono::milliseconds(50));
for (SizeType32 i = 0; i < requests.size(); i++)
{
if (cancelRequests.at(i))
{
executor.cancelRequest(requestIds.at(i));
}
}
std::unordered_map<IdType, bool> isStreaming;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
SizeType32 expectedNumResponses = 0;
for (SizeType32 i = 0; i < requests.size(); i++)
{
auto const& request = requests.at(i);
auto requestId = requestIds.at(i);
isStreaming[requestId] = request.getStreaming();
expectedNumTokens[requestId] = (request.getStreaming() ? 0 : inputTokens.size()) + maxNewTokens;
auto const numResponses = request.getStreaming() ? expectedNumTokens[requestId] : 1;
auto const numReturnSequences = request.getSamplingConfig().getBeamWidth() > 1
? 1
: request.getSamplingConfig().getNumReturnSequences().value_or(1);
expectedNumResponses += numResponses * numReturnSequences;
}
std::unordered_map<IdType, std::unordered_map<SizeType32, VecTokens>> tokens;
        // Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < requests.size() && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto requestId = response.getRequestId();
auto result = response.getResult();
numFinished += result.isFinal;
auto seqIdx = result.sequenceIndex;
auto numSequences = result.outputTokenIds.size();
auto& newTokens = result.outputTokenIds.at(numSequences - 1);
auto& reqResults = tokens[response.getRequestId()];
auto& reqTokens = reqResults[seqIdx];
if (isStreaming.at(requestId) && beamWidth > 1)
{
reqTokens = newTokens;
}
else
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
}
else
{
FAIL() << "Did not expect errors";
}
}
++iter;
}
EXPECT_LE(numResponses, expectedNumResponses);
EXPECT_EQ(numFinished, requests.size());
EXPECT_LT(iter, mMaxWaitMs);
for (auto requestIdx = 0; requestIdx < requests.size(); requestIdx++)
{
auto const requestId = requestIds.at(requestIdx);
for (auto seqIdx = 0; seqIdx < tokens.at(requestId).size(); seqIdx++)
{
auto const& seqTokens = tokens.at(requestId).at(seqIdx);
if (cancelRequests.at(requestIdx))
{
EXPECT_LT(seqTokens.size(), expectedNumTokens.at(requestId));
}
else
{
EXPECT_EQ(seqTokens.size(), expectedNumTokens.at(requestId));
}
}
}
}
}
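// Leader mode on multi-GPU engines: the leader rank enqueues requests and awaits responses, while
// non-leader ranks must throw when calling the request/response APIs.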
TEST_P(LeaderApiUsageTest, LeaderModeTest)
{
auto const modelName = std::get<0>(GetParam());
SizeType32 beamWidth = 2;
OutputConfig outConfig;
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
auto executorConfig = ExecutorConfig(beamWidth);
std::filesystem::path modelPath;
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
{
if (modelName == "llama_tp4_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
}
// For llama model, only run for multiple GPUs
// This is detected by setting an env variable when running the test
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == NULL)
{
GTEST_SKIP() << "Skipping Llama test";
}
else
{
// Check that it was launched with right number of MPI ranks
if (COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
}
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
// Since this is leader mode, all ranks should participate
EXPECT_TRUE(executor.isParticipant());
// Create the request
SizeType32 maxNewTokens = 50;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestStreaming
= Request(inputTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
    // Leader enqueues requests and waits for responses
if (executor.canEnqueueRequests())
{
auto requestId = executor.enqueueRequest(request);
auto requestId2 = executor.enqueueRequest(request);
auto requestId3 = executor.enqueueRequest(requestStreaming);
auto requestId4 = executor.enqueueRequest(requestStreaming);
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < 4 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
}
else
{
FAIL() << "Did not expect errors";
}
}
++iter;
}
EXPECT_EQ(numFinished, 4);
EXPECT_LT(iter, mMaxWaitMs);
}
else
{
        // Check that non-leader ranks cannot enqueue requests or use the other request/response APIs
EXPECT_THROW({ auto reqId = executor.enqueueRequest(request); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto responses = executor.awaitResponses(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto numResp = executor.getNumResponsesReady(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ executor.cancelRequest(1); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto stats = executor.getLatestIterationStats(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto stats = executor.getLatestRequestStats(); }, tensorrt_llm::common::TllmException);
}
}
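// ParallelConfig validation: orchestrator mode without an OrchestratorConfig should be rejected
// at executor construction.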
TEST_F(GptExecutorTest, validateParallelConfig)
{
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
{
auto executorConfig = ExecutorConfig();
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
{
std::string expectedErrMsg = "OrchestratorConfig must be set";
try
{
auto executorConfig = ExecutorConfig();
auto parallelConfig = ParallelConfig(CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR);
executorConfig.setParallelConfig(parallelConfig);
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
FAIL() << "Expected TllmException";
}
catch (tc::TllmException& e)
{
EXPECT_THAT(e.what(), testing::HasSubstr(expectedErrMsg));
}
catch (std::exception const& e)
{
FAIL() << "Expected TllmException";
}
}
}
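// Streaming timeout test: requests with allotted times of 0 ms, 1 ms and 5 s are expected to finish
// with kTIMED_OUT, kTIMED_OUT and kLENGTH respectively, with generated lengths within the expected bounds.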
TEST_P(TimeoutTest, TimeoutStreamingTest)
{
auto const modelName = std::get<0>(GetParam());
auto const useOrchestratorMode = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
auto executorConfig = ExecutorConfig(beamWidth);
std::filesystem::path modelPath;
bool isMultiGpu{false};
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
{
isMultiGpu = true;
if (modelName == "llama_tp4_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
}
if (modelName == "llama_tp1_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
}
    // Multi-GPU llama engines run only when the RUN_LLAMA_MULTI_GPU env variable is set;
    // single-GPU engines run only when it is not
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == NULL && isMultiGpu)
{
GTEST_SKIP() << "Skipping MultiGpu tests";
}
if (val != NULL && !isMultiGpu)
{
GTEST_SKIP() << "Skipping SingleGpu tests";
}
if (val != NULL && isMultiGpu)
{
// Check that it was launched with right number of MPI ranks
if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
{
            // Orchestrator mode, the test binary itself must run with a single rank
            FAIL() << "Orchestrator mode and world size is not equal to 1";
}
}
if (useOrchestratorMode)
{
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
std::nullopt, orchestratorConfig);
if (deviceIds.has_value())
{
parallelConfig.setDeviceIds(deviceIds.value());
}
executorConfig.setParallelConfig(parallelConfig);
}
else
{
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr maxNewTokens = 10;
    // Create one request that times out immediately.
    // Requests are currently not cancelled before forwardAsync, so it will still be scheduled for at least one forward.
VecTokens immediateCancelTokens{1, 2, 3, 4};
auto immediateCancelRequest
= Request(immediateCancelTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
immediateCancelRequest.setReturnAllGeneratedTokens(true);
immediateCancelRequest.setAllottedTimeMs(std::chrono::milliseconds(0));
SizeType32 constexpr immediateCancelMinLength = 0;
SizeType32 constexpr immediateCancelMaxLength = 1;
// create 1 request that times out during the first forward
VecTokens oneForwardTokens{11, 12, 13, 14};
auto oneForwardRequest
= Request(oneForwardTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
oneForwardRequest.setReturnAllGeneratedTokens(true);
oneForwardRequest.setAllottedTimeMs(std::chrono::milliseconds(1));
    SizeType32 constexpr oneForwardMinLength = 0;
    SizeType32 constexpr oneForwardMaxLength = 1;
// Create the request that finishes by the number of tokens
VecTokens finishedTokens{101, 102, 103, 104};
auto finishedRequest
= Request(finishedTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
finishedRequest.setReturnAllGeneratedTokens(true);
finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(5000));
SizeType32 constexpr finishedMinLength = 5;
SizeType32 constexpr finishedMaxLength = maxNewTokens;
std::vector<FinishReason> referenceFinishReasons
= {FinishReason::kTIMED_OUT, FinishReason::kTIMED_OUT, FinishReason::kLENGTH};
    std::vector<SizeType32> minLengths = {immediateCancelMinLength, oneForwardMinLength, finishedMinLength};
    std::vector<SizeType32> maxLengths = {immediateCancelMaxLength, oneForwardMaxLength, finishedMaxLength};
    // The final streaming response can be empty, so track the longest response seen per request and
    // check it against minLength at the end
std::vector<SizeType32> achievedLength = {0, 0, 0};
SizeType32 itNr{0};
if (executor.canEnqueueRequests())
{
std::vector<Request> requests = {immediateCancelRequest, oneForwardRequest, finishedRequest};
auto requestIds = executor.enqueueRequests(requests);
auto numFinished = 0;
while (numFinished < static_cast<SizeType32>(requests.size()))
{
itNr++;
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(requestIds, waitTime);
for (auto const& response : responses)
{
for (auto const& responseIt : response)
{
auto const reqId = responseIt.getRequestId();
if (responseIt.hasError())
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err
= "ReqId " + std::to_string(reqId) + " has already been processed and was terminated.";
if (responseIt.getErrorMsg() != err)
{
TLLM_THROW("Request id %lu encountered error: %s", reqId, responseIt.getErrorMsg().c_str());
}
continue;
}
auto const& result = responseIt.getResult();
if (result.isFinal)
{
requestIds.erase(std::remove(requestIds.begin(), requestIds.end(), reqId), requestIds.end());
numFinished++;
}
auto const finishReason = result.finishReasons;
auto const actualResponse = result.outputTokenIds;
TLLM_LOG_DEBUG("reqId %d finished %d", reqId, result.isFinal);
TLLM_LOG_DEBUG("actual response:");
for (auto const& beam : actualResponse)
{
std::string tokenStr;
for (auto tok : beam)
{
tokenStr += std::to_string(tok) + " ";
}
TLLM_LOG_DEBUG("%s", tokenStr.c_str());
}
TLLM_LOG_DEBUG(
"beams' length must be in range [%d, %d]", minLengths[reqId - 1], maxLengths[reqId - 1]);
if (result.isFinal)
{
TLLM_LOG_DEBUG("finishReason");
std::string reasonStr;
for (auto const reason : finishReason)
{
// cast for easier visibility during debugging
EXPECT_EQ(static_cast<int>(reason), static_cast<int>(referenceFinishReasons[reqId - 1]));
reasonStr += std::to_string(static_cast<int>(reason)) + " ";
}
TLLM_LOG_DEBUG("%s", reasonStr.c_str());
}
EXPECT_EQ(beamWidth, actualResponse.size());
for (int beam = 0; beam < beamWidth; beam++)
{
EXPECT_LE(actualResponse.at(beam).size(), maxLengths[reqId - 1]) << "for request " << reqId;
achievedLength[reqId - 1] = std::max(
achievedLength[reqId - 1], static_cast<SizeType32>(actualResponse.at(beam).size()));
}
}
}
}
for (int reqIt = 0; reqIt < achievedLength.size(); ++reqIt)
{
EXPECT_GE(achievedLength[reqIt], minLengths[reqIt])
<< "request " << reqIt + 1 << " has not achieved min lengths";
}
}
}
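// Non-streaming timeout test: timed-out requests are expected to return only their input tokens,
// while the request with a generous allotted time completes and matches the reference output.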
TEST_P(TimeoutTest, TimeoutNonstreamingTest)
{
auto const modelName = std::get<0>(GetParam());
auto const useOrchestratorMode = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
auto executorConfig = ExecutorConfig(beamWidth);
std::filesystem::path modelPath;
bool isMultiGpu{false};
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
{
isMultiGpu = true;
if (modelName == "llama_tp4_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
}
if (modelName == "llama_tp1_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
}
    // Multi-GPU llama engines run only when the RUN_LLAMA_MULTI_GPU env variable is set;
    // single-GPU engines run only when it is not
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == NULL && isMultiGpu)
{
GTEST_SKIP() << "Skipping MultiGpu tests";
}
if (val != NULL && !isMultiGpu)
{
GTEST_SKIP() << "Skipping SingleGpu tests";
}
if (val != NULL && isMultiGpu)
{
// Check that it was launched with right number of MPI ranks
if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
{
            // Orchestrator mode, the test binary itself must run with a single rank
            FAIL() << "Orchestrator mode and world size is not equal to 1";
}
}
if (useOrchestratorMode)
{
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
std::nullopt, orchestratorConfig);
if (deviceIds.has_value())
{
parallelConfig.setDeviceIds(deviceIds.value());
}
executorConfig.setParallelConfig(parallelConfig);
}
else
{
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr maxNewTokens = 5;
    // Create one request that times out immediately.
    // Requests are currently not cancelled before forwardAsync, so it will still be scheduled for at least one forward.
VecTokens immediateCancelTokens{1, 2, 3, 4};
auto immediateCancelRequest
= Request(immediateCancelTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
immediateCancelRequest.setAllottedTimeMs(std::chrono::milliseconds(0));
std::vector<std::vector<int>> immediateCancelResponse = {immediateCancelTokens, immediateCancelTokens};
// create 1 request that times out during the first forward
VecTokens oneForwardTokens{11, 12, 13, 14};
auto oneForwardRequest
= Request(oneForwardTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
oneForwardRequest.setAllottedTimeMs(std::chrono::milliseconds(1));
std::vector<std::vector<int>> oneForwardResponse = {oneForwardTokens, oneForwardTokens};
// Create the request that finishes by the number of tokens
VecTokens finishedTokens{101, 102, 103, 104};
auto finishedRequest
= Request(finishedTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(6000));
    std::vector<std::vector<int>> finishedResponse
        = {{101, 102, 103, 104, 49849, 225, 49849, 232, 55742}, {101, 102, 103, 104, 49849, 225, 49849, 232, 29082}};
    // Reference responses indexed by request id (ids are assumed to be assigned sequentially starting at 1)
    std::vector<BeamTokens> refResponses = {immediateCancelResponse, oneForwardResponse, finishedResponse};
std::vector<FinishReason> referenceFinishReasons
= {FinishReason::kTIMED_OUT, FinishReason::kTIMED_OUT, FinishReason::kLENGTH};
if (executor.canEnqueueRequests())
{
std::vector<Request> requests = {immediateCancelRequest, oneForwardRequest, finishedRequest};
auto requestIds = executor.enqueueRequests(requests);
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(requestIds, waitTime);
for (auto const& response : responses)
{
for (auto const& responseIt : response)
{
auto const reqId = responseIt.getRequestId();
if (responseIt.hasError())
{
TLLM_THROW("Request id %lu encountered error: %s", reqId, responseIt.getErrorMsg().c_str());
}
auto const& result = responseIt.getResult();
auto const finishReason = result.finishReasons;
auto const actualResponse = result.outputTokenIds;
TLLM_LOG_DEBUG("reqId %d finished %d", reqId, result.isFinal);
TLLM_LOG_DEBUG("actual response:");
for (auto const& beam : actualResponse)
{
std::string tokenStr;
for (auto tok : beam)
{
tokenStr += std::to_string(tok) + " ";
}
TLLM_LOG_DEBUG("%s", tokenStr.c_str());
}
TLLM_LOG_DEBUG("reference:");
auto referenceResponse = refResponses[reqId - 1];
for (auto const& beam : referenceResponse)
{
std::string tokenStr;
for (auto tok : beam)
{
tokenStr += std::to_string(tok) + " ";
}
TLLM_LOG_DEBUG("%s", tokenStr.c_str());
}
if (result.isFinal)
{
TLLM_LOG_DEBUG("finishReason");
std::string reasonStr;
for (auto const reason : finishReason)
{
// cast for easier visibility during debugging
EXPECT_EQ(static_cast<int>(reason), static_cast<int>(referenceFinishReasons[reqId - 1]));
reasonStr += std::to_string(static_cast<int>(reason)) + " ";
}
TLLM_LOG_DEBUG("%s", reasonStr.c_str());
}
EXPECT_EQ(beamWidth, actualResponse.size());
for (int beam = 0; beam < beamWidth; beam++)
{
EXPECT_EQ(referenceResponse.at(beam).size(), actualResponse.at(beam).size());
EXPECT_THAT(actualResponse.at(beam), testing::ElementsAreArray(referenceResponse.at(beam)));
}
}
}
}
}
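// Test suite instantiations: parameter combinations covering the GPT, Llama, Medusa and ChatGLM engines.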
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(false, true), // excludeInputFromOutput
testing::Values(1, 2) // beamWidth
),
generateTestName);
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamStatsTest,
testing::Combine( //
testing::Values(0, 1000), // iterStatsMaxIterations
testing::Values(false, true) // useOrchestratorMode
),
generateTestNameStats);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, ParamCancelReqTest,
testing::Combine( //
testing::Values(false, true), // useOrchestratorMode
testing::Values(1, 2), // beamWidth
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
),
generateTestNameCancelReq);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, TimeoutTest,
testing::Combine( //
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp1_pp1_cp1"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(2) // beamWidth
),
generateTestNameTimeoutTest);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LeaderApiUsageTest,
testing::Combine( //
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
),
generateTestNameLeaderApiUsage);
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(true), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(true), // returnContextLogits
testing::Values(true), // returnGenerationLogits
testing::Values("gpt"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false, true), // returnAllGeneratedTokens
testing::Values(1, 2) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(true), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(true), // returnGenerationLogits
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(LlamaMultiExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("llama_tp1_pp2_cp1"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(MedusaExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("medusa"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
// Disable some of ChatGLM's tests since they are the same as gpt's.
INSTANTIATE_TEST_SUITE_P(ChatGlmExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("chatglm"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1, 2) // numReturnSequences
),
generateTestNameAllParams);
// ChatGlm0 Test is for glm-10b.
INSTANTIATE_TEST_SUITE_P(ChatGlm0ExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false), // streaming
testing::Values(1), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("glm"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(ChatGlm2ExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false), // streaming
testing::Values(1), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("chatglm2"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(ChatGlm3ExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false), // streaming
testing::Values(1), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("chatglm3"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LogitsProcParamsTest,
testing::Combine( //
testing::Values(
"llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1"), // modelName
testing::Values(false, true), // batched
testing::Values(false, true) // replicated
),
generateTestNameLogitsProc);
INSTANTIATE_TEST_SUITE_P(GptExecutorGuidedDecodingTest, GuidedDecodingParamsTest,
testing::Combine(testing::Values("gpt")), generateTestNameGuidedDecoding);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorGuidedDecodingTest, GuidedDecodingParamsTest,
testing::Combine(
testing::Values("llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1")),
generateTestNameGuidedDecoding);