test: Reduce number of C++ test cases (#5437)

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Robin Kobus authored on 2025-07-01 09:40:49 +02:00, committed by GitHub
parent 7a617ad1fe
commit 5f77d212ef
6 changed files with 182 additions and 162 deletions
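
The pattern of the change: the INSTANTIATE_TEST_SUITE_P calls below feed their value lists into testing::Combine, and GoogleTest instantiates one test case per element of the Cartesian product of those lists, so pinning an axis such as the TrtGptModelType or enableTrtOverlap to a single value cuts the generated case count multiplicatively. A minimal sketch of that mechanism, using a toy ScaleTest fixture that is illustrative only and not part of this repository:

#include <gtest/gtest.h>

#include <tuple>

// Illustrative fixture: one axis for a batching mode, one for a boolean flag,
// mirroring the (TrtGptModelType, enableTrtOverlap) style of the real suites.
class ScaleTest : public ::testing::TestWithParam<std::tuple<int, bool>>
{
};

TEST_P(ScaleTest, Runs)
{
    auto const [mode, overlap] = GetParam();
    SUCCEED() << "mode=" << mode << " overlap=" << overlap;
}

// testing::Combine enumerates the Cartesian product of its value lists:
// 2 modes x 2 flags = 4 instantiated cases.
INSTANTIATE_TEST_SUITE_P(Before, ScaleTest,
    testing::Combine(testing::Values(0, 1), // two batching modes
        testing::Values(false, true)));     // enableTrtOverlap off and on

// Pinning both axes to a single value yields 1 x 1 = 1 case, which is the
// pattern applied throughout this commit.
INSTANTIATE_TEST_SUITE_P(After, ScaleTest,
    testing::Combine(testing::Values(1), // fused batching only
        testing::Values(true)));         // overlap enabled only

Linked against gtest_main, the Before suite runs four cases and the After suite runs one.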

View File

@@ -1042,7 +1042,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest,
.useGptAttentionPlugin()
.setKVCacheType(KVCacheType::kDISABLED)
.usePackedInput()),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(
@@ -1051,7 +1051,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest,
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
@@ -1068,7 +1068,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest,
.useGptAttentionPlugin()
.setKVCacheType(KVCacheType::kPAGED)
.usePackedInput()),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(
@@ -1077,7 +1077,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest,
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
@@ -1095,13 +1095,13 @@ INSTANTIATE_TEST_SUITE_P(GptKVOffloadingTest, ParamTest,
.setKVCacheType(KVCacheType::kPAGED)
.usePackedInput()
.setKVCacheReuse(true)),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(256), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
@@ -1134,7 +1134,7 @@ INSTANTIATE_TEST_SUITE_P(GptCudaGraphTests, ParamTest,
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(true), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1176,16 +1176,15 @@ INSTANTIATE_TEST_SUITE_P(GptNProfilesTests, ParamTest,
.usePackedInput()
.setKVCacheType(KVCacheType::kPAGED)
.useMultipleProfiles()),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelIfbTestType::BULK),
testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
testing::Values(
// TODO: enable more tests when mixed beam width is supported
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1214,7 +1213,7 @@ INSTANTIATE_TEST_SUITE_P(GptSqTests, ParamTest,
.usePackedInput()
.setKVCacheType(KVCacheType::kDISABLED)
.setQuantMethod(QuantMethod::kSMOOTH_QUANT)),
testing::Values(TrtGptModelType::InflightBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(
@@ -1243,7 +1242,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_GptChunkedContextTests, ParamTest,
.usePackedInput()
.setKVCacheType(KVCacheType::kPAGED)
.setMaxInputLength(128)),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelIfbTestType::BULK), // TrtGptModelIfbTestType
testing::Values(BeamConfig{1, {1}}), // beam config
testing::Values(257), // maxTokensInPagedKvCache
@@ -1272,7 +1271,7 @@ INSTANTIATE_TEST_SUITE_P(GptChunkedLongContextTests, ParamTest,
.setKVCacheType(KVCacheType::kPAGED)
.useDraftTokensExternalDecoding()
.setDraftTokens(5)),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT,
TrtGptModelIfbTestType::RANDOM), // TrtGptModelIfbTestType
testing::Values(BeamConfig{1, {1}}), // beam config
@@ -1310,7 +1309,7 @@ INSTANTIATE_TEST_SUITE_P(GptDraftTests, ParamTest,
.replaceLogits()
.collectGenerationLogitsFile()
.collectContextLogitsFile()),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}), // beamConfig
@@ -1391,7 +1390,7 @@ INSTANTIATE_TEST_SUITE_P(GptjTests, ParamTest,
.usePackedInput()
),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
// WAR: disable wavefront and random tests because of switched beams
testing::Values(TrtGptModelIfbTestType::BULK
/* , TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM */),
@@ -1488,7 +1487,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaTests, ParamTest,
.useTensorParallelism(2)
),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(
@@ -1570,7 +1569,7 @@ INSTANTIATE_TEST_SUITE_P(MedusaTests, ParamTest,
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(true, false), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1594,7 +1593,7 @@ INSTANTIATE_TEST_SUITE_P(EagleTests, ParamTest,
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(true, false), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1644,7 +1643,7 @@ INSTANTIATE_TEST_SUITE_P(ExplicitDraftTokensDecodingTests, ParamTest,
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
testing::Values(false), // enableCudaGraphMode
testing::Values(std::nullopt), // hostCacheSize
@@ -1665,7 +1664,7 @@ INSTANTIATE_TEST_SUITE_P(GptjFP8Tests, ParamTest,
.usePackedInput()
),
testing::Values(TrtGptModelType::InflightBatching, TrtGptModelType::InflightFusedBatching),
testing::Values(TrtGptModelType::InflightFusedBatching),
testing::Values(
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(

View File

@@ -291,8 +291,8 @@ void runDisaggTest(tensorrt_llm::testing::disaggexecutor::DisaggExecutorLeader&
++iter;
}
EXPECT_LT(iter, maxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, nbGivenInputs, streaming, outConfig.excludeInputFromOutput,
flakyTestInfo, isSpeculativeDecoding, returnAllGeneratedTokens, beamWidth, numReturnSequences, false);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numReturnSequences, false);
}
comm.barrier();
if (executor.isGenerationRank())
@@ -449,8 +449,8 @@ void runDisaggTest(DisaggExecutorOrchestrator& executor, tensorrt_llm::runtime::
++iter;
}
EXPECT_LT(iter, maxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, nbGivenInputs, streaming, outConfig.excludeInputFromOutput,
flakyTestInfo, isSpeculativeDecoding, returnAllGeneratedTokens, beamWidth, numReturnSequences, false);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numReturnSequences, false);
}
comm.barrier();
}
@@ -1110,8 +1110,8 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, nbGivenInputs, streaming, outConfig.excludeInputFromOutput,
flakyTestInfo, isSpeculativeDecoding, false, beamWidth, numReturnSequences, false);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numReturnSequences, false);
}
world_comm.barrier();
#else

View File

@@ -567,14 +567,23 @@ TEST_F(GptExecutorTest, GenerationChangeEndId)
}
}
// stream, excludeInputFromOutput, beamWidth
using ParamType = std::tuple<bool, bool, int>;
using ParamCancelReqType = std::tuple<bool, bool, int, int, std::string>;
using LeaderApiUsageType = std::tuple<bool, std::string>;
// useOrchestratorMode, beamWidth, modelName
using ParamCancelReqType = std::tuple<bool, int, std::string>;
// modelName
using LeaderApiUsageType = std::tuple<std::string>;
// iterStatsMaxIterations, useOrchestratorMode
using ParamStatsType = std::tuple<int, bool>;
// streaming, beamWidth, computeLogProbs, excludeInputInOutput, returnContextLogits, returnGenerationLogits, modelName,
// useOrchestratorMode, returnAllGeneratedTokens, numReturnSequences
using AllParamsType = std::tuple<bool, int, bool, bool, bool, bool, std::string, bool, bool, int>;
// modelName, batched, replicated
using LogitsProcParamsType = std::tuple<std::string, bool, bool>;
// modelName
using GuidedDecodingParamsType = std::tuple<std::string>;
using TimeoutTestParamsType = std ::tuple<std::string, bool, int>;
// modelName, useOrchestratorMode, beamWidth
using TimeoutTestParamsType = std::tuple<std::string, bool, int>;
std::string generateTestName(testing::TestParamInfo<ParamType> const& info)
{
@@ -596,19 +605,11 @@ std::string generateTestName(testing::TestParamInfo<ParamType> const& info)
std::string generateTestNameCancelReq(testing::TestParamInfo<ParamCancelReqType> const& info)
{
auto const streaming = std::get<0>(info.param);
auto const& useOrchestratorMode = std::get<1>(info.param);
int const beamWidth = std::get<2>(info.param);
int const numReturnSequences = std::get<3>(info.param);
auto const modelName = std::get<4>(info.param);
auto const& useOrchestratorMode = std::get<0>(info.param);
auto const beamWidth = std::get<1>(info.param);
auto const modelName = std::get<2>(info.param);
std::string name = "ExecutorTest";
if (streaming)
{
name += "Streaming";
}
name.append("BW" + std::to_string(beamWidth));
name.append("_numRetSeq" + std::to_string(numReturnSequences));
name.append("_" + modelName + "_");
if (useOrchestratorMode)
@@ -624,14 +625,8 @@ std::string generateTestNameCancelReq(testing::TestParamInfo<ParamCancelReqType>
std::string generateTestNameLeaderApiUsage(testing::TestParamInfo<LeaderApiUsageType> const& info)
{
auto const streaming = std::get<0>(info.param);
auto const modelName = std::get<1>(info.param);
auto const modelName = std::get<0>(info.param);
std::string name = "ExecutorTest";
if (streaming)
{
name += "Streaming";
}
name.append("_" + modelName);
return name;
}
@@ -1910,9 +1905,6 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
{
auto const beamWidth = beamResult.beamWidth;
std::unordered_map<IdType, SizeType32> reqIdToBatchId;
std::unordered_map<SizeType32, std::vector<BeamTokens>> tokens;
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
@@ -1962,19 +1954,16 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
auto const numSequences = beamWidth > 1 ? 1 : numReturnSequences;
auto const numReturnBeams = std::min(beamWidth, numReturnSequences);
std::vector<IdType> reqIds;
if (worldRank == 0)
{
reqIds = executor.enqueueRequests(std::move(requests));
auto const reqIds = executor.enqueueRequests(requests);
std::unordered_map<SizeType32, std::vector<BeamTokens>> tokens;
std::unordered_map<IdType, SizeType32> reqIdToBatchId;
for (SizeType32 req = 0; req < reqIds.size(); ++req)
{
std::vector<BeamTokens> resultTokens;
resultTokens.reserve(numSequences);
for (SizeType32 seqIdx = 0; seqIdx < numSequences; ++seqIdx)
{
resultTokens.emplace_back(numReturnBeams);
}
std::vector<BeamTokens> resultTokens(numSequences, BeamTokens(numReturnBeams));
tokens[req] = std::move(resultTokens);
reqIdToBatchId[reqIds.at(req)] = req;
}
@@ -1982,32 +1971,41 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
// Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
std::unordered_map<IdType, SizeType32> numResponses;
while (numFinished < maxRequests && iter < maxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
auto batchId = reqIdToBatchId.at(response.getRequestId());
numResponses[batchId]++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto batchId = reqIdToBatchId.at(response.getRequestId());
auto seqIdx = result.sequenceIndex;
auto& contextLogits = result.contextLogits;
auto& genLogits = result.generationLogits;
auto& outputTokenIds = result.outputTokenIds;
auto const& contextLogits = result.contextLogits;
auto const& genLogits = result.generationLogits;
auto const& outputTokenIds = result.outputTokenIds;
EXPECT_EQ(result.finishReasons.size(), numReturnBeams);
for (SizeType32 beam = 0; beam < numReturnBeams; ++beam)
{
auto& newTokens = outputTokenIds.at(beam);
auto const& newTokens = outputTokenIds.at(beam);
auto& reqTokens = tokens.at(batchId).at(seqIdx).at(beam);
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
if (!returnAllGeneratedTokens)
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
else
{
EXPECT_EQ(newTokens.size(),
(numResponses.at(batchId) + numReturnSequences - 1) / numReturnSequences);
reqTokens = newTokens;
}
// FinishReason is only supported for bw=1 and inflight batching.
if (beamWidth == 1)
{
@@ -2016,9 +2014,9 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
}
}
auto& cumLogProbs = result.cumLogProbs;
auto& logProbs = result.logProbs;
auto& beamTokens = tokens.at(batchId).at(seqIdx);
auto const& cumLogProbs = result.cumLogProbs;
auto const& logProbs = result.logProbs;
auto const& beamTokens = tokens.at(batchId).at(seqIdx);
EXPECT_EQ(beamTokens.size(), numReturnBeams);
if (!isNonGreedySampling)
@@ -2057,9 +2055,8 @@ void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& mode
++iter;
}
EXPECT_LT(iter, maxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, nbGivenInputs, streaming, outConfig.excludeInputFromOutput,
flakyTestInfo, isSpeculativeDecoding, returnAllGeneratedTokens, beamWidth, numSequences,
isNonGreedySampling);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numSequences, isNonGreedySampling);
}
}
@@ -2119,10 +2116,6 @@ TEST_P(AllParamsTest, TokenComparison)
{
GTEST_SKIP() << "Test does not support returnAllGeneratedTokens without streaming";
}
if (returnAllGeneratedTokens && outConfig.returnLogProbs)
{
GTEST_SKIP() << "Skip returnAllGeneratedTokens with outConfig.returnLogProbs to reduce number of tests";
}
std::optional<std::vector<SizeType32>> participantIds = std::nullopt;
@@ -3761,11 +3754,9 @@ TEST_F(GptExecutorTest, orchModeForwardError)
TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
{
bool const streaming = std::get<0>(GetParam());
bool const useOrchestratorMode = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
auto const numReturnSequences = std::get<3>(GetParam());
auto const modelName = std::get<4>(GetParam());
auto const useOrchestratorMode = std::get<0>(GetParam());
auto const beamWidth = std::get<1>(GetParam());
auto const modelName = std::get<2>(GetParam());
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
@@ -3840,39 +3831,59 @@ TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
// Create the request
SizeType32 maxNewTokens = 50;
VecTokens inputTokens{1, 2, 3, 4};
auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
auto request = Request(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
auto samplingConfig2 = tensorrt_llm::executor::SamplingConfig(beamWidth);
samplingConfig2.setNumReturnSequences(numReturnSequences);
auto request2 = Request(inputTokens, maxNewTokens, streaming, samplingConfig2, outConfig);
std::vector<Request> requests;
for (auto streaming : {false, true})
{
// Add two requests with numReturnSequences = 1
auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
// Add a request with numReturnSequences > 1
auto samplingConfig2 = tensorrt_llm::executor::SamplingConfig(beamWidth);
auto constexpr numReturnSequences = 2;
samplingConfig2.setNumReturnSequences(numReturnSequences);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig2, outConfig);
}
std::vector<bool> cancelRequests{true, false, true, true, false, true};
if (executor.canEnqueueRequests())
{
auto requestId = executor.enqueueRequest(request);
// Enqueue another request
auto requestId2 = executor.enqueueRequest(request);
// Enqueue a request of numReturnSequences > 1
auto requestId3 = executor.enqueueRequest(request2);
auto const requestIds = executor.enqueueRequests(requests);
// Cancel the first and third requests
std::this_thread::sleep_for(std::chrono::milliseconds(100));
executor.cancelRequest(requestId);
executor.cancelRequest(requestId3);
std::this_thread::sleep_for(std::chrono::milliseconds(50));
for (SizeType32 i = 0; i < requests.size(); i++)
{
if (cancelRequests.at(i))
{
executor.cancelRequest(requestIds.at(i));
}
}
SizeType32 expectedNumToken = (streaming ? 0 : inputTokens.size()) + maxNewTokens;
std::unordered_map<IdType, bool> isStreaming;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
SizeType32 expectedNumResponses = 0;
for (SizeType32 i = 0; i < requests.size(); i++)
{
auto const& request = requests.at(i);
auto requestId = requestIds.at(i);
isStreaming[requestId] = request.getStreaming();
expectedNumTokens[requestId] = (request.getStreaming() ? 0 : inputTokens.size()) + maxNewTokens;
auto const numResponses = request.getStreaming() ? expectedNumTokens[requestId] : 1;
auto const numReturnSequences = request.getSamplingConfig().getBeamWidth() > 1
? 1
: request.getSamplingConfig().getNumReturnSequences().value_or(1);
expectedNumResponses += numResponses * numReturnSequences;
}
std::unordered_map<IdType, std::unordered_map<SizeType32, VecTokens>> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
expectedNumTokens[requestId] = expectedNumToken;
expectedNumTokens[requestId2] = expectedNumToken;
expectedNumTokens[requestId3] = expectedNumToken;
// Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < 3 && iter < mMaxWaitMs)
while (numFinished < requests.size() && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
@@ -3881,6 +3892,7 @@ TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
numResponses++;
if (!response.hasError())
{
auto requestId = response.getRequestId();
auto result = response.getResult();
numFinished += result.isFinal;
auto seqIdx = result.sequenceIndex;
@@ -3888,7 +3900,7 @@ TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
auto& newTokens = result.outputTokenIds.at(numSequences - 1);
auto& reqResults = tokens[response.getRequestId()];
auto& reqTokens = reqResults[seqIdx];
if (streaming && beamWidth > 1)
if (isStreaming.at(requestId) && beamWidth > 1)
{
reqTokens = newTokens;
}
@@ -3905,25 +3917,34 @@ TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
++iter;
}
EXPECT_LE(numResponses, streaming ? 4 * expectedNumToken : 4);
EXPECT_EQ(numFinished, 3);
EXPECT_LE(numResponses, expectedNumResponses);
EXPECT_EQ(numFinished, requests.size());
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_LT(tokens[requestId][0].size(), expectedNumTokens[requestId]);
EXPECT_EQ(tokens[requestId2][0].size(), expectedNumTokens[requestId2]);
for (auto seqIdx = 0; seqIdx < tokens[requestId3].size(); seqIdx++)
for (auto requestIdx = 0; requestIdx < requests.size(); requestIdx++)
{
EXPECT_LT(tokens[requestId3][seqIdx].size(), expectedNumTokens[requestId3]);
auto const requestId = requestIds.at(requestIdx);
for (auto seqIdx = 0; seqIdx < tokens.at(requestId).size(); seqIdx++)
{
auto const& seqTokens = tokens.at(requestId).at(seqIdx);
if (cancelRequests.at(requestIdx))
{
EXPECT_LT(seqTokens.size(), expectedNumTokens.at(requestId));
}
else
{
EXPECT_EQ(seqTokens.size(), expectedNumTokens.at(requestId));
}
}
}
}
}
TEST_P(LeaderApiUsageTest, LeaderApiUsageTest)
TEST_P(LeaderApiUsageTest, LeaderModeTest)
{
bool const streaming = std::get<0>(GetParam());
auto const modelName = std::get<1>(GetParam());
auto const modelName = std::get<0>(GetParam());
SizeType32 beamWidth = 32;
SizeType32 beamWidth = 2;
OutputConfig outConfig;
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
@@ -3980,18 +4001,22 @@ TEST_P(LeaderApiUsageTest, LeaderApiUsageTest)
SizeType32 maxNewTokens = 50;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
= Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestStreaming
= Request(inputTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
// Leader enqueues requests and waits for responses
if (executor.canEnqueueRequests())
{
auto requestId = executor.enqueueRequest(request);
auto requestId2 = executor.enqueueRequest(request);
auto requestId3 = executor.enqueueRequest(requestStreaming);
auto requestId4 = executor.enqueueRequest(requestStreaming);
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < 2 && iter < mMaxWaitMs)
while (numFinished < 4 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
@@ -4010,7 +4035,7 @@ TEST_P(LeaderApiUsageTest, LeaderApiUsageTest)
}
++iter;
}
EXPECT_EQ(numFinished, 2);
EXPECT_EQ(numFinished, 4);
EXPECT_LT(iter, mMaxWaitMs);
}
else
@@ -4022,7 +4047,6 @@ TEST_P(LeaderApiUsageTest, LeaderApiUsageTest)
EXPECT_THROW({ executor.cancelRequest(1); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto stats = executor.getLatestIterationStats(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto stats = executor.getLatestRequestStats(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ executor.shutdown(); }, tensorrt_llm::common::TllmException);
}
}
@@ -4367,7 +4391,7 @@ TEST_P(TimeoutTest, TimeoutNonstreamingTest)
VecTokens finishedTokens{101, 102, 103, 104};
auto finishedRequest
= Request(finishedTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(5000));
finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(6000));
std::vector<std::vector<int>> finishedReponse
= {{101, 102, 103, 104, 49849, 225, 49849, 232, 55742}, {101, 102, 103, 104, 49849, 225, 49849, 232, 29082}};
@@ -4447,38 +4471,50 @@ TEST_P(TimeoutTest, TimeoutNonstreamingTest)
}
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamTest,
testing::Combine(testing::Values(false, true), // streaming
testing::Values(false, true), // excludeInputFromOutput
testing::Values(1, 2) // beamWidth
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(false, true), // excludeInputFromOutput
testing::Values(1, 2) // beamWidth
),
generateTestName);
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamStatsTest,
testing::Combine(testing::Values(0, 1000), testing::Values(false, true)), generateTestNameStats);
testing::Combine( //
testing::Values(0, 1000), // iterStatsMaxIterations
testing::Values(false, true) // useOrchestratorMode
),
generateTestNameStats);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, ParamCancelReqTest,
testing::Combine(testing::Values(false, true), testing::Values(false, true), testing::Values(1, 2),
testing::Values(1, 2), testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1")),
testing::Combine( //
testing::Values(false, true), // useOrchestratorMode
testing::Values(1, 2), // beamWidth
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
),
generateTestNameCancelReq);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, TimeoutTest,
testing::Combine(testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp1_pp1_cp1"),
testing::Values(false, true), testing::Values(2)),
testing::Combine( //
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp1_pp1_cp1"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(2) // beamWidth
),
generateTestNameTimeoutTest);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LeaderApiUsageTest,
testing::Combine(
testing::Values(false, true), testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1")),
testing::Combine( //
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
),
generateTestNameLeaderApiUsage);
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(false, true), // computeLogProbs
testing::Values(true), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false, true), // returnContextLogits
testing::Values(false, true), // returnGenerationLogits
testing::Values(true), // returnContextLogits
testing::Values(true), // returnGenerationLogits
testing::Values("gpt"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false, true), // returnAllGeneratedTokens
@@ -4490,10 +4526,10 @@ INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(false, true), // computeLogProbs
testing::Values(false), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false, true), // returnContextLogits
testing::Values(false, true), // returnGenerationLogits
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
@@ -4594,9 +4630,12 @@ INSTANTIATE_TEST_SUITE_P(ChatGlm3ExecutorTest, AllParamsTest,
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LogitsProcParamsTest,
testing::Combine(
testing::Values("llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1"),
testing::Values(false, true), testing::Values(false, true)),
testing::Combine( //
testing::Values(
"llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1"), // modelName
testing::Values(false, true), // batched
testing::Values(false, true) // replicated
),
generateTestNameLogitsProc);
INSTANTIATE_TEST_SUITE_P(GptExecutorGuidedDecodingTest, GuidedDecodingParamsTest,

View File

@@ -408,9 +408,9 @@ TestData TestData::loadTestData(BeamResult const& beamResults, ITensor const& gi
}
void TestData::verifyOutput(std::unordered_map<SizeType32, std::vector<executor::BeamTokens>> const& resultTokens,
std::vector<SizeType32> const& givenInputLengths, SizeType32 nbGivenInputs, bool streaming,
bool excludeInputFromOutput, FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding, bool returnAllGeneratedTokens,
SizeType32 reqBeamWidth, SizeType32 numReturnSequences, bool isNonGreedySampling)
std::vector<SizeType32> const& givenInputLengths, bool streaming, bool excludeInputFromOutput,
FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding, SizeType32 reqBeamWidth, SizeType32 numReturnSequences,
bool isNonGreedySampling)
{
for (auto const& [batchId, beamTokens] : resultTokens)
{
@@ -436,12 +436,6 @@ void TestData::verifyOutput(std::unordered_map<SizeType32, std::vector<executor:
= expectedOutputLengths[batchId * reqBeamWidth + beam]; // Ground truth output length
auto expectedOutputLength
= expectInputOutputLength - inputLength; // Number of new generated output tokens
if (returnAllGeneratedTokens)
{
// If returnAllGeneratedTokens, then the tokens of each iteration will contain all the previously
// generated tokens. Such as: [(a), (a, b), (a, b, c), (a, b, c, d), ...]
expectedOutputLength = (1 + expectedOutputLength) * expectedOutputLength / 2;
}
bool inputNotIncluded = (streaming || excludeInputFromOutput);
bool anyMismatch = false;
@@ -458,12 +452,6 @@ void TestData::verifyOutput(std::unordered_map<SizeType32, std::vector<executor:
<< "b: " << batchId << " seq: " << seqIdx << " beam: " << beam;
}
if (returnAllGeneratedTokens)
{
// If returnAllGeneratedTokens, the output of the last iteration will contain all the output tokens
predictedTokens.erase(
predictedTokens.begin(), predictedTokens.end() - (expectInputOutputLength - inputLength));
}
auto numPredTokens = static_cast<SizeType32>(predictedTokens.size());
if (isSpeculativeDecoding)
@@ -631,7 +619,7 @@ void TestData::validateGenerationLogits(bool getGenLogits, bool isFinal, bool st
SizeType32 numGeneratedToken = genLogits.value().getShape()[0];
if (returnAllGeneratedTokens)
{
EXPECT_EQ((numGeneratedToken + 1) * numGeneratedToken / 2, numPredTokens);
EXPECT_EQ(numGeneratedToken, numPredTokens);
}
else
{

View File

@@ -187,9 +187,8 @@ public:
tr::BufferManager& manager, executor::OutputConfig const& outConfig, ModelIds const& modelIds);
void verifyOutput(std::unordered_map<SizeType32, std::vector<executor::BeamTokens>> const& resultTokens,
std::vector<SizeType32> const& givenInputLengths, SizeType32 nbGivenInputs, bool streaming,
bool excludeInputFromOutput, FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding,
bool returnAllGeneratedTokens, SizeType32 reqBeamWidth, SizeType32 numReturnSequences,
std::vector<SizeType32> const& givenInputLengths, bool streaming, bool excludeInputFromOutput,
FlakyTestInfo flakyTestInfo, bool isSpeculativeDecoding, SizeType32 reqBeamWidth, SizeType32 numReturnSequences,
bool isNonGreedySampling);
void verifyLogProbs(bool computeLogProbs, bool streaming, bool excludeInputFromOutput, SizeType32 inputLength,

View File

@@ -478,12 +478,7 @@ def test_enc_dec(build_google_tests, multi_gpu_model, build_dir):
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
indirect=True)
@pytest.mark.parametrize("mode", [
"orchestrator",
pytest.param(
"leader",
marks=pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5026255"))
])
@pytest.mark.parametrize("mode", ["orchestrator", "leader"])
@pytest.mark.parametrize("multi_gpu_model", ["llama"], indirect=True)
def test_llama_executor(build_google_tests, multi_gpu_model, mode, lora_setup,
build_dir):