test: Reduce number of C++ test cases (#5437)

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Robin Kobus authored on 2025-07-01 09:40:49 +02:00, committed by GitHub
parent 7a617ad1fe
commit 5f77d212ef
6 changed files with 182 additions and 162 deletions
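
The pattern of the change: the INSTANTIATE_TEST_SUITE_P calls below feed their value lists into testing::Combine, and GoogleTest instantiates one test case per element of the Cartesian product of those lists, so pinning an axis such as the TrtGptModelType or enableTrtOverlap to a single value cuts the generated case count multiplicatively. A minimal sketch of that mechanism, using a toy ScaleTest fixture that is illustrative only and not part of this repository:

#include <gtest/gtest.h>

#include <tuple>

// Illustrative fixture: one axis for a batching mode, one for a boolean flag,
// mirroring the (TrtGptModelType, enableTrtOverlap) style of the real suites.
class ScaleTest : public ::testing::TestWithParam<std::tuple<int, bool>>
{
};

TEST_P(ScaleTest, Runs)
{
    auto const [mode, overlap] = GetParam();
    SUCCEED() << "mode=" << mode << " overlap=" << overlap;
}

// testing::Combine enumerates the Cartesian product of its value lists:
// 2 modes x 2 flags = 4 instantiated cases.
INSTANTIATE_TEST_SUITE_P(Before, ScaleTest,
    testing::Combine(testing::Values(0, 1), // two batching modes
        testing::Values(false, true)));     // enableTrtOverlap off and on

// Pinning both axes to a single value yields 1 x 1 = 1 case, which is the
// pattern applied throughout this commit.
INSTANTIATE_TEST_SUITE_P(After, ScaleTest,
    testing::Combine(testing::Values(1), // fused batching only
        testing::Values(true)));         // overlap enabled only

Linked against gtest_main, the Before suite runs four cases and the After suite runs one.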

View File

@@ -1042,7 +1042,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest,
.useGptAttentionPlugin()
.setKVCacheType(KVCacheType::kDISABLED)
.usePackedInput()),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(
@@ -1051,7 +1051,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest,
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
@@ -1068,7 +1068,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest,
.useGptAttentionPlugin()
.setKVCacheType(KVCacheType::kPAGED)
.usePackedInput()),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(
@@ -1077,7 +1077,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest,
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
@@ -1095,13 +1095,13 @@ INSTANTIATE_TEST_SUITE_P(GptKVOffloadingTest, ParamTest,
.setKVCacheType(KVCacheType::kPAGED)
.usePackedInput()
.setKVCacheReuse(true)),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(256), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
@@ -1134,7 +1134,7 @@ INSTANTIATE_TEST_SUITE_P(GptCudaGraphTests, ParamTest,
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(true), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1176,16 +1176,15 @@ INSTANTIATE_TEST_SUITE_P(GptNProfilesTests, ParamTest,
.usePackedInput()
.setKVCacheType(KVCacheType::kPAGED)
.useMultipleProfiles()),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelIfbTestType::BULK),
testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
testing::Values(
// TODO: enable more tests when mixed beam width is supported
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1214,7 +1213,7 @@ INSTANTIATE_TEST_SUITE_P(GptSqTests, ParamTest,
.usePackedInput()
.setKVCacheType(KVCacheType::kDISABLED)
.setQuantMethod(QuantMethod::kSMOOTH_QUANT)),
testing::Values(TrtGptModelType::InflightBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(
@@ -1243,7 +1242,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_GptChunkedContextTests, ParamTest,
.usePackedInput()
.setKVCacheType(KVCacheType::kPAGED)
.setMaxInputLength(128)),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelIfbTestType::BULK), // TrtGptModelIfbTestType
testing::Values(BeamConfig{1, {1}}), // beam config
testing::Values(257), // maxTokensInPagedKvCache
@@ -1272,7 +1271,7 @@ INSTANTIATE_TEST_SUITE_P(GptChunkedLongContextTests, ParamTest,
.setKVCacheType(KVCacheType::kPAGED)
.useDraftTokensExternalDecoding()
.setDraftTokens(5)),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT,
TrtGptModelIfbTestType::RANDOM), // TrtGptModelIfbTestType
testing::Values(BeamConfig{1, {1}}), // beam config
@@ -1310,7 +1309,7 @@ INSTANTIATE_TEST_SUITE_P(GptDraftTests, ParamTest,
.replaceLogits()
.collectGenerationLogitsFile()
.collectContextLogitsFile()),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}), // beamConfig
@@ -1391,7 +1390,7 @@ INSTANTIATE_TEST_SUITE_P(GptjTests, ParamTest,
.usePackedInput()
),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
// WAR: disable wavefront and random tests because of switched beams
testing::Values(TrtGptModelIfbTestType::BULK
/* , TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM */),
@@ -1488,7 +1487,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaTests, ParamTest,
.useTensorParallelism(2)
),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(
@@ -1570,7 +1569,7 @@ INSTANTIATE_TEST_SUITE_P(MedusaTests, ParamTest,
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(true, false), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1594,7 +1593,7 @@ INSTANTIATE_TEST_SUITE_P(EagleTests, ParamTest,
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(true, false), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1644,7 +1643,7 @@ INSTANTIATE_TEST_SUITE_P(ExplicitDraftTokensDecodingTests, ParamTest,
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1665,7 +1664,7 @@ INSTANTIATE_TEST_SUITE_P(GptjFP8Tests, ParamTest,
.usePackedInput()
),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(

View File

@@ -291,8 +291,8 @@ void runDisaggTest(tensorrt_llm::testing::disaggexecutor::DisaggExecutorLeader&
++iter;
}
EXPECT_LT(iter, maxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, nbGivenInputs, streaming, outConfig.excludeInputFromOutput,
flakyTestInfo, isSpeculativeDecoding, returnAllGeneratedTokens, beamWidth, numReturnSequences, false);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numReturnSequences, false);
}
comm.barrier();
if (executor.isGenerationRank())
@@ -449,8 +449,8 @@ void runDisaggTest(DisaggExecutorOrchestrator& executor, tensorrt_llm::runtime::
++iter;
}
EXPECT_LT(iter, maxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, nbGivenInputs, streaming, outConfig.excludeInputFromOutput,
flakyTestInfo, isSpeculativeDecoding, returnAllGeneratedTokens, beamWidth, numReturnSequences, false);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numReturnSequences, false);
}
comm.barrier();
}
@@ -1110,8 +1110,8 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, nbGivenInputs, streaming, outConfig.excludeInputFromOutput,
flakyTestInfo, isSpeculativeDecoding, false, beamWidth, numReturnSequences, false);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numReturnSequences, false);
}
world_comm.barrier();
#else

View File

@@ -567,14 +567,23 @@ TEST_F(GptExecutorTest, GenerationChangeEndId)
}
}
// stream, excludeInputFromOutput, beamWidth
using ParamType = std::tuple<bool, bool, int>;
using ParamCancelReqType = std::tuple<bool, bool, int, int, std::string>;
using LeaderApiUsageType = std::tuple<bool, std::string>;
// useOrchestratorMode, beamWidth, modelName
using ParamCancelReqType = std::tuple<bool, int, std::string>;
// modelName
using LeaderApiUsageType = std::tuple<std::string>;
// iterStatsMaxIterations, useOrchestratorMode
using ParamStatsType = std::tuple<int, bool>;
// streaming, beamWidth, computeLogProbs, excludeInputInOutput, returnContextLogits, returnGenerationLogits, modelName,
// useOrchestratorMode, returnAllGeneratedTokens, numReturnSequences
using AllParamsType = std::tuple<bool, int, bool, bool, bool, bool, std::string, bool, bool, int>;
// modelName, batched, replicated
using LogitsProcParamsType = std::tuple<std::string, bool, bool>;
// modelName
using GuidedDecodingParamsType = std::tuple<std::string>;
using TimeoutTestParamsType = std ::tuple<std::string, bool, int>;
// modelName, useOrchestratorMode, beamWidth
using TimeoutTestParamsType = std::tuple<std::string, bool, int>;
std::string generateTestName(testing::TestParamInfo<ParamType> const& info)
{
@@ -596,19 +605,11 @@ std::string generateTestName(testing::TestParamInfo<ParamType> const& info)
std::string generateTestNameCancelReq(testing::TestParamInfo<ParamCancelReqType> const& info)
{
auto const streaming = std::get<0>(info.param);
auto const& useOrchestratorMode = std::get<1>(info.param);
int const beamWidth = std::get<2>(info.param);
int const numReturnSequences = std::get<3>(info.param);
auto const modelName = std::get<4>(info.param);
auto const& useOrchestratorMode = std::get<0>(info.param);
auto const beamWidth = std::get<1>(info.param);
auto const modelName = std::get<2>(info.param);
std::string name = "ExecutorTest";
if (streaming)
{
name += "Streaming";
}
name.append("BW" + std::to_string(beamWidth));
name.append("_numRetSeq" + std::to_string(numReturnSequences));
name.append("_" + modelName + "_");
if (useOrchestratorMode)
@@ -624,14 +625,8 @@ std::string generateTestNameCancelReq(testing::TestParamInfo<ParamCancelReqType>
std::string generateTestNameLeaderApiUsage(testing::TestParamInfo<LeaderApiUsageType> const& info)
{
auto const streaming = std::get<0>(info.param);
auto const modelName = std::get<1>(info.param);
auto const modelName = std::get<0>(info.param);
std::string name = "ExecutorTest";
if (streaming)
{
name += "Streaming";
}
name.append("_" + modelName);
return name;
}
@@ -1910,9 +1905,6 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
{
auto const beamWidth = beamResult.beamWidth;
std::unordered_map<IdType, SizeType32> reqIdToBatchId;
std::unordered_map<SizeType32, std::vector<BeamTokens>> tokens;
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
@@ -1962,19 +1954,16 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
auto const numSequences = beamWidth > 1 ? 1 : numReturnSequences;
auto const numReturnBeams = std::min(beamWidth, numReturnSequences);
std::vector<IdType> reqIds;
if (worldRank == 0)
{
reqIds = executor.enqueueRequests(std::move(requests));
auto const reqIds = executor.enqueueRequests(requests);
std::unordered_map<SizeType32, std::vector<BeamTokens>> tokens;
std::unordered_map<IdType, SizeType32> reqIdToBatchId;
for (SizeType32 req = 0; req < reqIds.size(); ++req)
{
std::vector<BeamTokens> resultTokens;
resultTokens.reserve(numSequences);
for (SizeType32 seqIdx = 0; seqIdx < numSequences; ++seqIdx)
{
resultTokens.emplace_back(numReturnBeams);
}
std::vector<BeamTokens> resultTokens(numSequences, BeamTokens(numReturnBeams));
tokens[req] = std::move(resultTokens);
reqIdToBatchId[reqIds.at(req)] = req;
}
@@ -1982,32 +1971,41 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
// Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
std::unordered_map<IdType, SizeType32> numResponses;
while (numFinished < maxRequests && iter < maxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
auto batchId = reqIdToBatchId.at(response.getRequestId());
numResponses[batchId]++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto batchId = reqIdToBatchId.at(response.getRequestId());
auto seqIdx = result.sequenceIndex;
auto& contextLogits = result.contextLogits;
auto& genLogits = result.generationLogits;
auto& outputTokenIds = result.outputTokenIds;
auto const& contextLogits = result.contextLogits;
auto const& genLogits = result.generationLogits;
auto const& outputTokenIds = result.outputTokenIds;
EXPECT_EQ(result.finishReasons.size(), numReturnBeams);
for (SizeType32 beam = 0; beam < numReturnBeams; ++beam)
{
auto& newTokens = outputTokenIds.at(beam);
auto const& newTokens = outputTokenIds.at(beam);
auto& reqTokens = tokens.at(batchId).at(seqIdx).at(beam);
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
if (!returnAllGeneratedTokens)
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
else
{
EXPECT_EQ(newTokens.size(),
(numResponses.at(batchId) + numReturnSequences - 1) / numReturnSequences);
reqTokens = newTokens;
}
// FinishReason is only supported for bw=1 and inflight batching.
if (beamWidth == 1)
{
@@ -2016,9 +2014,9 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
}
}
auto& cumLogProbs = result.cumLogProbs;
auto& logProbs = result.logProbs;
auto& beamTokens = tokens.at(batchId).at(seqIdx);
auto const& cumLogProbs = result.cumLogProbs;
auto const& logProbs = result.logProbs;
auto const& beamTokens = tokens.at(batchId).at(seqIdx);
EXPECT_EQ(beamTokens.size(), numReturnBeams);
if (!isNonGreedySampling)
@@ -2057,9 +2055,8 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
++iter;
}
EXPECT_LT(iter, maxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, nbGivenInputs, streaming, outConfig.excludeInputFromOutput,
flakyTestInfo, isSpeculativeDecoding, returnAllGeneratedTokens, beamWidth, numSequences,
isNonGreedySampling);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numSequences, isNonGreedySampling);
}
}
@@ -2119,10 +2116,6 @@ TEST_P(AllParamsTest, TokenComparison)
{
GTEST_SKIP() << "Test does not support returnAllGeneratedTokens without streaming";
}
if (returnAllGeneratedTokens && outConfig.returnLogProbs)
{
GTEST_SKIP() << "Skip returnAllGeneratedTokens with outConfig.returnLogProbs to reduce number of tests";
}
std::optional<std::vector<SizeType32>> participantIds = std::nullopt;
@@ -3761,11 +3754,9 @@ TEST_F(GptExecutorTest, orchModeForwardError)
TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
{
bool const streaming = std::get<0>(GetParam());
bool const useOrchestratorMode = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
auto const numReturnSequences = std::get<3>(GetParam());
auto const modelName = std::get<4>(GetParam());
auto const useOrchestratorMode = std::get<0>(GetParam());
auto const beamWidth = std::get<1>(GetParam());
auto const modelName = std::get<2>(GetParam());
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
@@ -3840,39 +3831,59 @@ TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
// Create the request
SizeType32 maxNewTokens = 50;
VecTokens inputTokens{1, 2, 3, 4};
auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
auto request = Request(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
auto samplingConfig2 = tensorrt_llm::executor::SamplingConfig(beamWidth);
samplingConfig2.setNumReturnSequences(numReturnSequences);
auto request2 = Request(inputTokens, maxNewTokens, streaming, samplingConfig2, outConfig);
std::vector<Request> requests;
for (auto streaming : {false, true})
{
// Add two requests with numReturnSequences = 1
auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
// Add a request with numReturnSequences > 1
auto samplingConfig2 = tensorrt_llm::executor::SamplingConfig(beamWidth);
auto constexpr numReturnSequences = 2;
samplingConfig2.setNumReturnSequences(numReturnSequences);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig2, outConfig);
}
std::vector<bool> cancelRequests{true, false, true, true, false, true};
if (executor.canEnqueueRequests())
{
auto requestId = executor.enqueueRequest(request);
// Enqueue another request
auto requestId2 = executor.enqueueRequest(request);
// Enqueue a request of numReturnSequences > 1
auto requestId3 = executor.enqueueRequest(request2);
auto const requestIds = executor.enqueueRequests(requests);
// Cancel the first and third requests
std::this_thread::sleep_for(std::chrono::milliseconds(100));
executor.cancelRequest(requestId);
executor.cancelRequest(requestId3);
std::this_thread::sleep_for(std::chrono::milliseconds(50));
for (SizeType32 i = 0; i < requests.size(); i++)
{
if (cancelRequests.at(i))
{
executor.cancelRequest(requestIds.at(i));
}
}
SizeType32 expectedNumToken = (streaming ? 0 : inputTokens.size()) + maxNewTokens;
std::unordered_map<IdType, bool> isStreaming;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
SizeType32 expectedNumResponses = 0;
for (SizeType32 i = 0; i < requests.size(); i++)
{
auto const& request = requests.at(i);
auto requestId = requestIds.at(i);
isStreaming[requestId] = request.getStreaming();
expectedNumTokens[requestId] = (request.getStreaming() ? 0 : inputTokens.size()) + maxNewTokens;
auto const numResponses = request.getStreaming() ? expectedNumTokens[requestId] : 1;
auto const numReturnSequences = request.getSamplingConfig().getBeamWidth() > 1
? 1
: request.getSamplingConfig().getNumReturnSequences().value_or(1);
expectedNumResponses += numResponses * numReturnSequences;
}
std::unordered_map<IdType, std::unordered_map<SizeType32, VecTokens>> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
expectedNumTokens[requestId] = expectedNumToken;
expectedNumTokens[requestId2] = expectedNumToken;
expectedNumTokens[requestId3] = expectedNumToken;
// Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < 3 && iter < mMaxWaitMs)
while (numFinished < requests.size() && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
@@ -3881,6 +3892,7 @@ TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
numResponses++;
if (!response.hasError())
{
auto requestId = response.getRequestId();
auto result = response.getResult();
numFinished += result.isFinal;
auto seqIdx = result.sequenceIndex;
@@ -3888,7 +3900,7 @@ TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
auto& newTokens = result.outputTokenIds.at(numSequences - 1);
auto& reqResults = tokens[response.getRequestId()];
auto& reqTokens = reqResults[seqIdx];
if (streaming && beamWidth > 1)
if (isStreaming.at(requestId) && beamWidth > 1)
{
reqTokens = newTokens;
}
@@ -3905,25 +3917,34 @@ TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
++iter;
}
EXPECT_LE(numResponses, streaming ? 4 * expectedNumToken : 4);
EXPECT_EQ(numFinished, 3);
EXPECT_LE(numResponses, expectedNumResponses);
EXPECT_EQ(numFinished, requests.size());
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_LT(tokens[requestId][0].size(), expectedNumTokens[requestId]);
EXPECT_EQ(tokens[requestId2][0].size(), expectedNumTokens[requestId2]);
for (auto seqIdx = 0; seqIdx < tokens[requestId3].size(); seqIdx++)
for (auto requestIdx = 0; requestIdx < requests.size(); requestIdx++)
{
EXPECT_LT(tokens[requestId3][seqIdx].size(), expectedNumTokens[requestId3]);
auto const requestId = requestIds.at(requestIdx);
for (auto seqIdx = 0; seqIdx < tokens.at(requestId).size(); seqIdx++)
{
auto const& seqTokens = tokens.at(requestId).at(seqIdx);
if (cancelRequests.at(requestIdx))
{
EXPECT_LT(seqTokens.size(), expectedNumTokens.at(requestId));
}
else
{
EXPECT_EQ(seqTokens.size(), expectedNumTokens.at(requestId));
}
}
}
}
}
TEST_P(LeaderApiUsageTest, LeaderApiUsageTest)
TEST_P(LeaderApiUsageTest, LeaderModeTest)
{
bool const streaming = std::get<0>(GetParam());
auto const modelName = std::get<1>(GetParam());
auto const modelName = std::get<0>(GetParam());
SizeType32 beamWidth = 32;
SizeType32 beamWidth = 2;
OutputConfig outConfig;
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
@@ -3980,18 +4001,22 @@ TEST_P(LeaderApiUsageTest, LeaderApiUsageTest)
SizeType32 maxNewTokens = 50;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
= Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestStreaming
= Request(inputTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
// Leader enqueues requests and waits for responses
if (executor.canEnqueueRequests())
{
auto requestId = executor.enqueueRequest(request);
auto requestId2 = executor.enqueueRequest(request);
auto requestId3 = executor.enqueueRequest(requestStreaming);
auto requestId4 = executor.enqueueRequest(requestStreaming);
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < 2 && iter < mMaxWaitMs)
while (numFinished < 4 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
@@ -4010,7 +4035,7 @@ TEST_P(LeaderApiUsageTest, LeaderApiUsageTest)
}
++iter;
}
EXPECT_EQ(numFinished, 2);
EXPECT_EQ(numFinished, 4);
EXPECT_LT(iter, mMaxWaitMs);
}
else
@@ -4022,7 +4047,6 @@ TEST_P(LeaderApiUsageTest, LeaderApiUsageTest)
EXPECT_THROW({ executor.cancelRequest(1); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto stats = executor.getLatestIterationStats(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto stats = executor.getLatestRequestStats(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ executor.shutdown(); }, tensorrt_llm::common::TllmException);
}
}
@@ -4367,7 +4391,7 @@ TEST_P(TimeoutTest, TimeoutNonstreamingTest)
VecTokens finishedTokens{101, 102, 103, 104};
auto finishedRequest
= Request(finishedTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(5000));
finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(6000));
std::vector<std::vector<int>> finishedReponse
= {{101, 102, 103, 104, 49849, 225, 49849, 232, 55742}, {101, 102, 103, 104, 49849, 225, 49849, 232, 29082}};
@@ -4447,38 +4471,50 @@ TEST_P(TimeoutTest, TimeoutNonstreamingTest)
}
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamTest,
testing::Combine(testing::Values(false, true), // streaming
testing::Values(false, true), // excludeInputFromOutput
testing::Values(1, 2) // beamWidth
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(false, true), // excludeInputFromOutput
testing::Values(1, 2) // beamWidth
),
generateTestName);
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamStatsTest,
testing::Combine(testing::Values(0, 1000), testing::Values(false, true)), generateTestNameStats);
testing::Combine( //
testing::Values(0, 1000), // iterStatsMaxIterations
testing::Values(false, true) // useOrchestratorMode
),
generateTestNameStats);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, ParamCancelReqTest,
testing::Combine(testing::Values(false, true), testing::Values(false, true), testing::Values(1, 2),
testing::Values(1, 2), testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1")),
testing::Combine( //
testing::Values(false, true), // useOrchestratorMode
testing::Values(1, 2), // beamWidth
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
),
generateTestNameCancelReq);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, TimeoutTest,
testing::Combine(testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp1_pp1_cp1"),
testing::Values(false, true), testing::Values(2)),
testing::Combine( //
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp1_pp1_cp1"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(2) // beamWidth
),
generateTestNameTimeoutTest);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LeaderApiUsageTest,
testing::Combine(
testing::Values(false, true), testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1")),
testing::Combine( //
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
),
generateTestNameLeaderApiUsage);
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(false, true), // computeLogProbs
testing::Values(true), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false, true), // returnContextLogits
testing::Values(false, true), // returnGenerationLogits
testing::Values(true), // returnContextLogits
testing::Values(true), // returnGenerationLogits
testing::Values("gpt"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false, true), // returnAllGeneratedTokens
@@ -4490,10 +4526,10 @@ INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(false, true), // computeLogProbs
testing::Values(false), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false, true), // returnContextLogits
testing::Values(false, true), // returnGenerationLogits
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
@@ -4594,9 +4630,12 @@ INSTANTIATE_TEST_SUITE_P(ChatGlm3ExecutorTest, AllParamsTest,
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LogitsProcParamsTest,
testing::Combine(
testing::Values("llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1"),
testing::Values(false, true), testing::Values(false, true)),
testing::Combine( //
testing::Values(
"llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1"), // modelName
testing::Values(false, true), // batched
testing::Values(false, true) // replicated
),
generateTestNameLogitsProc);
INSTANTIATE_TEST_SUITE_P(GptExecutorGuidedDecodingTest, GuidedDecodingParamsTest,

View File

@@ -408,9 +408,9 @@ TestData TestData::loadTestData(BeamResult const& beamResults, ITensor const& gi
}
void TestData::verifyOutput(std::unordered_map<SizeType32, std::vector<executor::BeamTokens>> const& resultTokens,
std::vector<SizeType32> const& givenInputLengths, SizeType32 nbGivenInputs, bool streaming,
bool excludeInputFromOutput, FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding, bool returnAllGeneratedTokens,
SizeType32 reqBeamWidth, SizeType32 numReturnSequences, bool isNonGreedySampling)
std::vector<SizeType32> const& givenInputLengths, bool streaming, bool excludeInputFromOutput,
FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding, SizeType32 reqBeamWidth, SizeType32 numReturnSequences,
bool isNonGreedySampling)
{
for (auto const& [batchId, beamTokens] : resultTokens)
{
@@ -436,12 +436,6 @@ void TestData::verifyOutput(std::unordered_map<SizeType32, std::vector<executor:
= expectedOutputLengths[batchId * reqBeamWidth + beam]; // Ground truth output length
auto expectedOutputLength
= expectInputOutputLength - inputLength; // Number of new generated output tokens
if (returnAllGeneratedTokens)
{
// If returnAllGeneratedTokens, then the tokens of each iteration will contain all the previously
// generated tokens. Such as: [(a), (a, b), (a, b, c), (a, b, c, d), ...]
expectedOutputLength = (1 + expectedOutputLength) * expectedOutputLength / 2;
}
bool inputNotIncluded = (streaming || excludeInputFromOutput);
bool anyMismatch = false;
@@ -458,12 +452,6 @@ void TestData::verifyOutput(std::unordered_map<SizeType32, std::vector<executor:
<< "b: " << batchId << " seq: " << seqIdx << " beam: " << beam;
}
if (returnAllGeneratedTokens)
{
// If returnAllGeneratedTokens, the output of the last iteration will contain all the output tokens
predictedTokens.erase(
predictedTokens.begin(), predictedTokens.end() - (expectInputOutputLength - inputLength));
}
auto numPredTokens = static_cast<SizeType32>(predictedTokens.size());
if (isSpeculativeDecoding)
@@ -631,7 +619,7 @@ void TestData::validateGenerationLogits(bool getGenLogits, bool isFinal, bool st
SizeType32 numGeneratedToken = genLogits.value().getShape()[0];
if (returnAllGeneratedTokens)
{
EXPECT_EQ((numGeneratedToken + 1) * numGeneratedToken / 2, numPredTokens);
EXPECT_EQ(numGeneratedToken, numPredTokens);
}
else
{

View File

@@ -187,9 +187,8 @@ public:
tr::BufferManager& manager, executor::OutputConfig const& outConfig, ModelIds const& modelIds);
void verifyOutput(std::unordered_map<SizeType32, std::vector<executor::BeamTokens>> const& resultTokens,
std::vector<SizeType32> const& givenInputLengths, SizeType32 nbGivenInputs, bool streaming,
bool excludeInputFromOutput, FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding,
bool returnAllGeneratedTokens, SizeType32 reqBeamWidth, SizeType32 numReturnSequences,
std::vector<SizeType32> const& givenInputLengths, bool streaming, bool excludeInputFromOutput,
FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding, SizeType32 reqBeamWidth, SizeType32 numReturnSequences,
bool isNonGreedySampling);
void verifyLogProbs(bool computeLogProbs, bool streaming, bool excludeInputFromOutput, SizeType32 inputLength,

View File

@@ -478,12 +478,7 @@ def test_enc_dec(build_google_tests, multi_gpu_model, build_dir):
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
indirect=True)
@pytest.mark.parametrize("mode", [
"orchestrator",
pytest.param(
"leader",
marks=pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5026255"))
])
@pytest.mark.parametrize("mode", ["orchestrator", "leader"])
@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
def test_llama_executor(build_google_tests, multi_gpu_model, mode, lora_setup,
build_dir):