// TensorRT-LLMs/cpp/tests/executor/executorTest.cpp
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#ifndef TOP_LEVEL_DIR
#error "Define TOP_LEVEL_DIR"
#endif
#include "executorTest.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/executor/dataTransceiverState.h"
#include "tensorrt_llm/executor/requestWithId.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/executor/version.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/utils/numpyUtils.h"
#include "tensorrt_llm/testing/modelSpec.h"
#include "tests/utils/common.h"
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <nlohmann/json.hpp>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <deque>
#include <filesystem>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
namespace tr = tensorrt_llm::runtime;
namespace tc = tensorrt_llm::common;
using namespace tensorrt_llm::testing;
using namespace tensorrt_llm::executor;
using namespace std::chrono_literals;
namespace fs = std::filesystem;
using tensorrt_llm::testing::KVCacheType;
using tensorrt_llm::testing::ModelSpec;
namespace
{
auto const LORA_DATA_PATH = DATA_PATH / "lora-test-weights-gpt2-tp1";
auto const LORA_WEIGHTS_FILE = LORA_DATA_PATH / "source.npy";
auto const LORA_CONFIG_FILE = LORA_DATA_PATH / "config.npy";
auto constexpr LLAMA_INPUT_FILE = "input_tokens_llama.npy";
auto constexpr LLAMA_VOCAB_SIZE_PADDED = 128256;
auto constexpr LLAMA_PAD_ID = 128001;
auto constexpr LLAMA_END_ID = 128001;
} // namespace
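// Helper: constructing an Executor with these arguments is expected to throw; the error message must contain expectedErrMsg.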
void testInvalidCtor(std::filesystem::path const& enginePath, ModelType modelType, ExecutorConfig executorConfig,
std::string expectedErrMsg = "")
{
try
{
auto executor = Executor(enginePath, modelType, executorConfig);
FAIL() << "Expected TllmException";
}
catch (std::exception const& e)
{
EXPECT_THAT(e.what(), testing::HasSubstr(expectedErrMsg));
}
}
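// Verify that the version constant no longer holds the @TRTLLM_VERSION@ placeholder and matches version().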
TEST_F(GptExecutorTest, version)
{
EXPECT_STRNE(kTensorRtLlmVersion, "@TRTLLM_VERSION@");
EXPECT_STREQ(kTensorRtLlmVersion, version());
}
TEST_F(GptExecutorTest, validCtor)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
TEST_F(GptExecutorTest, invalidCtor)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
std::filesystem::path invalidPath{"Bla"};
// Invalid path
{
testInvalidCtor(invalidPath, ModelType::kDECODER_ONLY, executorConfig, "File does not exist");
}
}
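// After shutdown(), the executor must reject further calls: enqueueRequest, awaitResponses, stats queries and cancelRequest all throw.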
TEST_F(GptExecutorTest, enqueueAfterShutdown)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
auto requestId = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
done = response.getResult().isFinal;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
executor.shutdown();
EXPECT_FALSE(executor.canEnqueueRequests());
std::string expErrMsg{"Shutdown called"};
EXPECT_THAT([&]() { auto reqId = executor.enqueueRequest(request); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
EXPECT_THAT([&]() { auto resp = executor.awaitResponses(); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
EXPECT_THAT([&]() { auto stats = executor.getLatestIterationStats(); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
EXPECT_THAT([&]() { auto stats = executor.getLatestRequestStats(); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
EXPECT_THAT([&]() { executor.cancelRequest(requestId); },
testing::Throws<tensorrt_llm::common::TllmException>(
testing::Property(&tensorrt_llm::common::TllmException::what, testing::HasSubstr(expErrMsg))));
}
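// A request referencing a LoRA task id without weights must yield an error response when the task is not cached.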
TEST_F(GptExecutorTest, missingPeftTask)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_LORA_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
auto loraConfig = LoraConfig{10};
request.setLoraConfig(loraConfig);
auto requestId = executor.enqueueRequest(request);
bool done = false;
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
EXPECT_EQ(err, std::string("LoRA task 10 not found in cache. Please send LoRA weights with request"));
done = true;
}
else
{
FAIL() << "Expects error due to missing Lora weights";
}
}
EXPECT_TRUE(done);
}
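// With external draft tokens and generation logits enabled, the returned logits have shape (1, numAcceptedDraftTokens, vocabSizePadded).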
TEST_F(GptExecutorTest, ReturnAcceptedTokenLogits)
{
SizeType32 constexpr beamWidth{1};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
// Create executor config
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setGatherGenerationLogits(true);
// Enable kv cache reuse of executorConfig
bool enableBlockReuse = true;
FloatType freeGpuMemoryFraction = 0.4;
auto kvCacheConfig
= KvCacheConfig(enableBlockReuse, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
executorConfig.setKvCacheConfig(kvCacheConfig);
// Create executor
auto trtEnginePath
= (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DRAFT_TOKENS_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4, 5, 6, 7, 8};
std::vector<bool> streamingOptions{false, true};
for (auto streaming : streamingOptions)
{
auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth));
// Set draft tokens
auto draftTokens = VecTokens{9, 10, 11, 12, 13}; // draft tokens
auto draftLength = draftTokens.size();
FloatType const acceptanceThreshold = 0.00001f; // Ensure the draft token can be accepted
auto externalDraftTokensConfig = ExternalDraftTokensConfig(draftTokens, std::nullopt, acceptanceThreshold);
request.setExternalDraftTokensConfig(externalDraftTokensConfig);
// Set return accepted token logits for this request
OutputConfig outConfig;
outConfig.returnGenerationLogits = true;
request.setOutputConfig(outConfig);
// Enqueue this request
auto requestId = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < 5000)
{
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
auto result = response.getResult();
done = result.isFinal;
auto& genLogits = result.generationLogits;
EXPECT_TRUE(genLogits.has_value());
// Expected shape: (1, numAcceptedDraftToken, vocabSizePadded)
auto const& acceptedTokenLogitsShape = genLogits->getShape();
EXPECT_EQ(acceptedTokenLogitsShape.size(), 3);
EXPECT_EQ(acceptedTokenLogitsShape[0], 1);
EXPECT_LE(acceptedTokenLogitsShape[1], draftLength); // number of accepted tokens
EXPECT_EQ(acceptedTokenLogitsShape[2], vocabSizePadded); // vocabSizePadded
}
}
++iter;
}
}
}
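// Stop generation early via end id or stop words and check output length, finish reason, and that the generation logits argmax matches each output token.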
TEST_F(GptExecutorTest, GenerationLogitsEarlyStop)
{
SizeType32 constexpr beamWidth{1};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
auto constexpr streaming = false;
ExtendedRuntimePerfKnobConfig perfKnobConfig = ExtendedRuntimePerfKnobConfig();
// Create executor config
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setExtendedRuntimePerfKnobConfig(perfKnobConfig);
executorConfig.setGatherGenerationLogits(true);
// Create executor
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto const inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
auto const* const givenInputData = tr::bufferCast<TokenIdType const>(*givenInput);
auto const& inputShape = givenInput->getShape();
ASSERT_EQ(inputShape.nbDims, 2);
ASSERT_GT(inputShape.d[0], 0);
BeamResult beamResult{beamWidth};
auto const resultsPath
= GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();
beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
// Set return generation logits for this request
OutputConfig outConfig;
outConfig.returnGenerationLogits = true;
outConfig.excludeInputFromOutput = true;
// Load expected outputs for each beam width value
auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds);
auto const maxSeqLen = testData.maxSeqLen;
// Load expected outputs and inputs
std::vector<Request> requests;
std::vector<SizeType32> reqMaxNewTokens;
auto constexpr reqIdx = 0;
SizeType32 inputLen = givenInputLengths.at(reqIdx);
auto maxNewTokens = maxSeqLen - maxInputLength;
reqMaxNewTokens.push_back(maxNewTokens);
auto const* const seqBegin = givenInputData + reqIdx * maxInputLength;
auto request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId);
// copy request
auto request2 = request;
auto const expectedOutputData = tr::BufferRange<TokenIdType const>(*testData.expectedOutputIds);
auto const expectedOutputLengths = testData.expectedOutputLengths;
auto const endPos = expectedOutputLengths[reqIdx] - 3;
auto const endIndex = tc::flat_index3(reqIdx, beamWidth - 1, endPos, beamWidth, maxSeqLen);
auto const endToken = expectedOutputData[endIndex];
// Set end id to stop early
request.setEndId(endToken);
requests.emplace_back(std::move(request));
// Set stop words to stop early
request2.setStopWords({{endToken}});
requests.emplace_back(std::move(request2));
// Enqueue requests
auto requestIds = executor.enqueueRequests(requests);
std::map<IdType, SizeType32> expectedNewTokens;
expectedNewTokens[requestIds.at(0)] = endPos - inputLen;
expectedNewTokens[requestIds.at(1)] = endPos - inputLen + 1;
std::map<IdType, FinishReason> expectedFinishReason;
expectedFinishReason[requestIds.at(0)] = FinishReason::kEND_ID;
expectedFinishReason[requestIds.at(1)] = FinishReason::kSTOP_WORDS;
std::map<IdType, bool> done;
std::for_each(requestIds.begin(), requestIds.end(), [&done](auto id) { done[id] = false; });
int iter = 0;
while (!(std::all_of(done.begin(), done.end(), [](auto x) { return x.second; })) && iter < 5000)
{
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
auto const reqId = response.getRequestId();
auto const& result = response.getResult();
EXPECT_TRUE(result.isFinal);
done.at(reqId) = result.isFinal;
// only 1 beam
auto const& outputIds = result.outputTokenIds.at(0);
EXPECT_EQ(outputIds.size(), expectedNewTokens.at(reqId)) << "req " << reqId;
auto const& finishReason = result.finishReasons.at(0);
EXPECT_EQ(finishReason, expectedFinishReason.at(reqId)) << "req " << reqId;
auto const& genLogits = result.generationLogits;
EXPECT_TRUE(genLogits.has_value());
// Expected shape: (1, numGeneratedTokens, vocabSizePadded)
auto const& generationLogitsShape = genLogits->getShape();
EXPECT_EQ(generationLogitsShape.size(), 3);
EXPECT_EQ(generationLogitsShape[0], 1);
EXPECT_LE(generationLogitsShape[1], maxNewTokens);
EXPECT_EQ(generationLogitsShape[2], vocabSizePadded);
auto const genLogitsTensor = detail::toITensor(*genLogits);
genLogitsTensor->squeeze(0); // only 1 beam
for (size_t outputIdx = 0; outputIdx < expectedNewTokens.at(reqId); ++outputIdx)
{
// logits argmax should be equal to tokenId
auto const genLogitsSlice = tr::ITensor::slice(genLogitsTensor, outputIdx, 1);
auto const genLogitsRange = tr::BufferRange<float>(*genLogitsSlice);
auto const* maxPos = std::max_element(genLogitsRange.begin(), genLogitsRange.end());
auto const maxIdx = std::distance(genLogitsRange.begin(), maxPos);
auto const tokenId = outputIds.at(outputIdx);
// Observed a token mismatch at index 2 after building the GPT engine with TRT builder optimization
// level 3. The test case is sensitive to slight variations in kernel computation, so we skip the
// token id check at index 2.
if (outputIdx != 2)
{
EXPECT_EQ(tokenId, maxIdx) << "req " << reqId << " outputIdx " << outputIdx;
}
}
}
}
++iter;
}
}
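// Enqueue a request with a customized end id first, then enqueue the same prompt again and check the output against the ground truth (finish reason kLENGTH).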
TEST_F(GptExecutorTest, GenerationChangeEndId)
{
SizeType32 constexpr beamWidth{2};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
auto constexpr streaming = false;
ExtendedRuntimePerfKnobConfig perfKnobConfig = ExtendedRuntimePerfKnobConfig();
perfKnobConfig.setEnableContextFMHAFP32Acc(true); // use fmha fp32 acc for better accuracy
// Create executor config
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setExtendedRuntimePerfKnobConfig(perfKnobConfig);
// Create executor
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto const inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
auto const* const givenInputData = tr::bufferCast<TokenIdType const>(*givenInput);
auto const& inputShape = givenInput->getShape();
ASSERT_EQ(inputShape.nbDims, 2);
ASSERT_GT(inputShape.d[0], 0);
BeamResult beamResult{beamWidth};
auto const resultsPath
= GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_CONTEXTFMHAFP32ACC_RESULT_FILE();
// Only return tokens for checking
OutputConfig outConfig;
outConfig.excludeInputFromOutput = true;
// Load expected outputs for each beam width value
auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds);
auto const maxSeqLen = testData.maxSeqLen;
// Load expected outputs and inputs
std::vector<Request> requests;
std::vector<SizeType32> reqMaxNewTokens;
// Only use the first request to test
auto constexpr reqIdx = 0;
SizeType32 inputLen = givenInputLengths.at(reqIdx);
auto maxNewTokens = maxSeqLen - maxInputLength;
reqMaxNewTokens.push_back(maxNewTokens);
auto const* const seqBegin = givenInputData + reqIdx * maxInputLength;
// Use customized `EndId` to enqueue once
auto request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId);
TokenIdType customizedEndId = *(seqBegin + 1); // Use a token that appears in the ground truth
request.setEndId(customizedEndId);
requests.emplace_back(std::move(request));
auto requestIds = executor.enqueueRequests(requests);
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(waitTime);
if (responses.at(0).hasError())
{
FAIL();
}
requests.clear();
// Change back to default `EndId` to enqueue again, and check the output
request = Request(VecTokens(seqBegin, seqBegin + inputLen), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, modelIds.endId);
auto const expectedOutputData = tr::BufferRange<TokenIdType const>(*testData.expectedOutputIds);
auto const expectedOutputLengths = testData.expectedOutputLengths;
auto const endPos = expectedOutputLengths[reqIdx];
auto const endIndex = tc::flat_index3(reqIdx, beamWidth, endPos, beamWidth, maxSeqLen);
auto const endToken = expectedOutputData[endIndex];
request.setEndId(endToken);
requests.emplace_back(std::move(request));
requestIds = executor.enqueueRequests(requests);
auto const requestId = requestIds.at(0);
std::map<IdType, SizeType32> expectedNewTokens;
expectedNewTokens[requestId] = endPos - inputLen;
std::map<IdType, FinishReason> expectedFinishReason;
expectedFinishReason[requestId] = FinishReason::kLENGTH;
std::map<IdType, bool> done;
std::for_each(requestIds.begin(), requestIds.end(), [&done](auto id) { done[id] = false; });
int iter = 0;
while (!(std::all_of(done.begin(), done.end(), [](auto x) { return x.second; })) && iter < 5000)
{
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(waitTime);
auto& response = responses.at(0);
if (response.hasError())
{
FAIL();
}
else
{
auto const reqId = response.getRequestId();
auto const& result = response.getResult();
EXPECT_TRUE(result.isFinal);
done.at(reqId) = result.isFinal;
bool anyMismatch = false;
for (int i = 0; i < result.outputTokenIds.size(); ++i)
{
auto const& outputIds = result.outputTokenIds.at(i);
EXPECT_EQ(outputIds.size(), expectedNewTokens.at(reqId)) << "req " << reqId;
anyMismatch |= outputIds.size() != expectedNewTokens.at(reqId);
auto const& finishReason = result.finishReasons.at(i);
EXPECT_EQ(finishReason, expectedFinishReason.at(reqId)) << "req " << reqId;
anyMismatch |= finishReason != expectedFinishReason.at(reqId);
if (anyMismatch)
{
break;
}
for (int j = 0; j < outputIds.size(); ++j)
{
auto const resultToken = outputIds[j];
auto const groundTruthToken = expectedOutputData[maxSeqLen * i + inputLen + j];
EXPECT_EQ(resultToken, groundTruthToken);
anyMismatch |= resultToken != groundTruthToken;
}
}
EXPECT_FALSE(anyMismatch);
}
++iter;
}
}
// streaming, excludeInputFromOutput, beamWidth
using ParamType = std::tuple<bool, bool, int>;
// useOrchestratorMode, beamWidth, modelName
using ParamCancelReqType = std::tuple<bool, int, std::string>;
// modelName
using LeaderApiUsageType = std::tuple<std::string>;
// iterStatsMaxIterations, useOrchestratorMode
using ParamStatsType = std::tuple<int, bool>;
// streaming, beamWidth, computeLogProbs, excludeInputInOutput, returnContextLogits, returnGenerationLogits, modelName,
// useOrchestratorMode, returnAllGeneratedTokens, numReturnSequences
using AllParamsType = std::tuple<bool, int, bool, bool, bool, bool, std::string, bool, bool, int>;
// modelName, batched, replicated
using LogitsProcParamsType = std::tuple<std::string, bool, bool>;
// modelName
using GuidedDecodingParamsType = std::tuple<std::string>;
// modelName, useOrchestratorMode, beamWidth
using TimeoutTestParamsType = std::tuple<std::string, bool, int>;
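// Helpers that build readable gtest test names from the parameter tuples above.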
std::string generateTestName(testing::TestParamInfo<ParamType> const& info)
{
auto const streaming = std::get<0>(info.param);
auto const excludeInputFromOutput = std::get<1>(info.param);
auto const beamWidth = std::get<2>(info.param);
std::string name = "ExecutorTest";
if (streaming)
{
name += "Streaming";
}
if (excludeInputFromOutput)
{
name += "ExclInput";
}
name.append("BW" + std::to_string(beamWidth));
return name;
}
std::string generateTestNameCancelReq(testing::TestParamInfo<ParamCancelReqType> const& info)
{
auto const& useOrchestratorMode = std::get<0>(info.param);
auto const beamWidth = std::get<1>(info.param);
auto const modelName = std::get<2>(info.param);
std::string name = "ExecutorTest";
name.append("BW" + std::to_string(beamWidth));
name.append("_" + modelName + "_");
if (useOrchestratorMode)
{
name.append("OrchMode");
}
else
{
name.append("LeaderMode");
}
return name;
}
std::string generateTestNameLeaderApiUsage(testing::TestParamInfo<LeaderApiUsageType> const& info)
{
auto const modelName = std::get<0>(info.param);
std::string name = "ExecutorTest";
name.append("_" + modelName);
return name;
}
std::string generateTestNameLogitsProc(testing::TestParamInfo<LogitsProcParamsType> const& info)
{
auto const modelName = std::get<0>(info.param);
bool const batched = std::get<1>(info.param);
bool const replicated = std::get<2>(info.param);
std::string name = "ExecutorTest";
name.append("_" + modelName);
if (batched)
{
name.append("_Batched");
}
if (replicated)
{
name.append("_Replicated");
}
return name;
}
std::string generateTestNameGuidedDecoding(testing::TestParamInfo<GuidedDecodingParamsType> const& info)
{
auto const modelName = std::get<0>(info.param);
std::string name = "ExecutorTest";
name.append("_" + modelName);
return name;
}
std::string generateTestNameTimeoutTest(testing::TestParamInfo<TimeoutTestParamsType> const& info)
{
auto const modelName = std::get<0>(info.param);
auto const& useOrchestratorMode = std::get<1>(info.param);
auto const beamWidth = std::get<2>(info.param);
std::string name = "ExecutorTest";
name.append("_" + modelName);
if (useOrchestratorMode)
{
name.append("_OrchMode");
}
else
{
name.append("_LeaderMode");
}
name.append("_BW" + std::to_string(beamWidth));
return name;
}
std::string generateTestNameStats(testing::TestParamInfo<ParamStatsType> const& info)
{
int iterStatsMaxIterations = std::get<0>(info.param);
auto const& useOrchestratorMode = std::get<1>(info.param);
std::string name = "ExecutorTest_";
name.append(std::to_string(iterStatsMaxIterations) + "_");
if (useOrchestratorMode)
{
name.append("OrchMode");
}
else
{
name.append("LeaderMode");
}
return name;
}
std::string generateTestNameAllParams(testing::TestParamInfo<AllParamsType> const& info)
{
auto const streaming = std::get<0>(info.param);
auto const& beamWidth = std::get<1>(info.param);
auto const& computeLogProbs = std::get<2>(info.param);
auto const& excludeInputInOutput = std::get<3>(info.param);
auto const& returnContextLogits = std::get<4>(info.param);
auto const& returnGenerationLogits = std::get<5>(info.param);
auto const modelName = std::get<6>(info.param);
auto const& useOrchestratorMode = std::get<7>(info.param);
auto const& returnAllGeneratedTokens = std::get<8>(info.param);
auto const& numReturnSequences = std::get<9>(info.param);
std::string name = "ExecutorTest_";
if (streaming)
{
name += "Streaming";
}
name.append("_BW" + std::to_string(beamWidth));
name.append("Nseq" + std::to_string(numReturnSequences));
if (computeLogProbs)
{
name.append("LogProbs");
}
if (excludeInputInOutput)
{
name.append("ExcludeInput");
}
if (returnContextLogits)
{
name.append("ContextLogits");
}
if (returnGenerationLogits)
{
name.append("GenerationLogits");
}
name.append("_" + modelName + "_");
if (useOrchestratorMode)
{
name.append("OrchMode");
}
else
{
name.append("LeaderMode");
}
if (returnAllGeneratedTokens)
{
name.append("returnAllGeneratedTokens");
}
return name;
}
class ParamTest : public GptExecutorTest, public ::testing::WithParamInterface<ParamType>
{
};
class ParamStatsTest : public GptExecutorTest, public ::testing::WithParamInterface<ParamStatsType>
{
};
class AllParamsTest : public GptExecutorTest, public ::testing::WithParamInterface<AllParamsType>
{
};
class ParamCancelReqTest : public GptExecutorTest, public ::testing::WithParamInterface<ParamCancelReqType>
{
};
class LeaderApiUsageTest : public GptExecutorTest, public ::testing::WithParamInterface<LeaderApiUsageType>
{
};
class LogitsProcParamsTest : public GptExecutorTest, public ::testing::WithParamInterface<LogitsProcParamsType>
{
};
class GuidedDecodingParamsTest : public GptExecutorTest, public ::testing::WithParamInterface<GuidedDecodingParamsType>
{
};
class TimeoutTest : public GptExecutorTest, public ::testing::WithParamInterface<TimeoutTestParamsType>
{
};
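// Run a single request and verify the per-iteration statistics (active requests, memory usage, KV cache and inflight batching stats) and their JSON serialization.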
TEST_F(GptExecutorTest, GetLatestStats)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(std::move(request));
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
done = response.getResult().isFinal;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
// Expect 6 non-empty iterations
auto stats = executor.getLatestIterationStats();
EXPECT_EQ(stats.size(), 6);
uint64_t currentIter = 0;
for (auto const& stat : stats)
{
EXPECT_EQ(stat.timestamp.size(), 26);
EXPECT_EQ(stat.iter, currentIter);
if (currentIter != 5)
{
EXPECT_EQ(stat.numActiveRequests, 1);
}
else
{
// For the last iteration the number of active requests
// should be zero.
EXPECT_EQ(stat.numActiveRequests, 0);
}
EXPECT_EQ(stat.maxNumActiveRequests, 64);
// Very loose check to make sure the memory stats are valid
EXPECT_GT(stat.gpuMemUsage, 16);
EXPECT_GT(stat.cpuMemUsage, 16);
EXPECT_GT(stat.pinnedMemUsage, 16);
// Stats for KV cache
EXPECT_TRUE(stat.kvCacheStats.has_value());
KvCacheStats const& kvStats = stat.kvCacheStats.value();
EXPECT_GT(kvStats.maxNumBlocks, 0);
EXPECT_GT(kvStats.freeNumBlocks, 0);
EXPECT_EQ(kvStats.usedNumBlocks, currentIter == maxNewTokens ? 0 : 1);
EXPECT_GT(kvStats.tokensPerBlock, 0);
EXPECT_GT(kvStats.allocTotalBlocks, 0);
EXPECT_GT(kvStats.allocNewBlocks, 0);
EXPECT_GE(kvStats.reusedBlocks, 0);
EXPECT_GE(kvStats.missedBlocks, 0);
EXPECT_GE(kvStats.cacheHitRate, 0);
// Stats for inflight batching
EXPECT_TRUE(stat.inflightBatchingStats.has_value() && !stat.staticBatchingStats.has_value());
InflightBatchingStats const& modelStats = stat.inflightBatchingStats.value();
EXPECT_EQ(modelStats.numScheduledRequests, currentIter == maxNewTokens ? 0 : 1);
EXPECT_EQ(modelStats.numContextRequests, currentIter == 0 ? 1 : 0);
EXPECT_EQ(modelStats.numGenRequests, currentIter == 0 || currentIter == maxNewTokens ? 0 : 1);
EXPECT_EQ(modelStats.numPausedRequests, 0);
EXPECT_EQ(modelStats.numCtxTokens, currentIter == 0 ? inputTokens.size() : 0);
EXPECT_EQ(modelStats.microBatchId, 0);
EXPECT_NEAR(
modelStats.avgNumDecodedTokensPerIter, currentIter == 0 || currentIter == maxNewTokens ? 0.f : 1.f, 1e-9f);
auto jsonStr = JsonSerialization::toJsonStr(stat);
EXPECT_THAT(jsonStr, testing::HasSubstr("\"iter\":" + std::to_string(currentIter)));
EXPECT_THAT(jsonStr, testing::HasSubstr("\"staticBatchingStats\":null"));
EXPECT_THAT(jsonStr, testing::HasSubstr("\"numCtxTokens\":" + std::to_string(modelStats.numCtxTokens)));
EXPECT_THAT(jsonStr, testing::HasSubstr("\"numGenRequests\":" + std::to_string(modelStats.numGenRequests)));
++currentIter;
}
}
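// Two staggered requests: verify iteration counts, numCompletedRequests, numNewActiveRequests and queue latency depending on whether the requests overlapped.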
TEST_F(GptExecutorTest, GetLatestStatsWithMultipleRequests)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the requests
SizeType32 const numRequests = 2;
std::vector<SizeType32> maxNewTokens{3, 5};
std::vector<VecTokens> inputTokens{{1, 2, 3, 4}, {5, 6, 7}};
std::vector<IdType> reqIds;
for (SizeType32 ireq = 0; ireq < numRequests; ++ireq)
{
auto request = Request(inputTokens[ireq], maxNewTokens[ireq], streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(std::move(request));
reqIds.emplace_back(requestId);
// sleep for 20 ms before sending the next request
std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
for (SizeType32 ireq = 0; ireq < numRequests; ++ireq)
{
auto requestId = reqIds[ireq];
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
done = response.getResult().isFinal;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
}
// NOTES:
// Expect at least max(maxNewTokens), i.e. 5, non-empty iterations.
// The 4th iteration should have numCompletedRequests equal to 1.
// Depending on the timing, the first iteration will have either:
// - 2 active requests, or
// - 1 active request and 1 queued request.
auto stats = executor.getLatestIterationStats();
EXPECT_GT(stats.size(), 0); // make sure we have at least 1 stat before accessing the 0-th element
if (stats[0].numActiveRequests == 2)
{
// we cannot reliably check queue latency since both started in the same iteration
// there should be exactly 5 non-empty iterations
EXPECT_EQ(stats.size(), 5);
// only check numCompletedRequests in 4th iteration
EXPECT_EQ(stats[3].numCompletedRequests, 1);
// The 1st iteration should record the queueing time of both requests;
EXPECT_EQ(stats[0].numNewActiveRequests, 2);
// the remaining iterations should not report any queueing time.
for (int i = 1; i < stats.size(); ++i)
{
EXPECT_EQ(stats[i].numNewActiveRequests, 0);
}
}
else
{
// there should be more than 5 non-empty iterations since the 2nd request started after the 1st iteration
EXPECT_GT(stats.size(), 5);
// the 1st request's completion is at the 4th iteration
EXPECT_EQ(stats[3].numCompletedRequests, 1);
// the 1st iteration records one request's queueing time
EXPECT_EQ(stats[0].numNewActiveRequests, 1);
// in the iteration where the 2nd request became active, the queue latency must be > 0
uint64_t currentIter = 0;
for (auto const& stat : stats)
{
// To check when the 2nd request becomes active, we need to consider 2 cases:
// - it overlaps with the first request
// => only check the queue time in this case
// - it doesn't overlap with the first request (e.g. the 1st request ended too fast)
// => little to no queue time, cannot check reliably
// so we only check the queue time when numActiveRequests > 1, i.e. the overlap happened after the first iteration
if (stat.numActiveRequests > 1)
{
EXPECT_GT(currentIter, 0); // it must be after 1st iteration
EXPECT_GT(stat.newActiveRequestsQueueLatencyMS, 0);
// the 2nd request records its queueing time in this iteration
EXPECT_EQ(stat.numNewActiveRequests, 1);
break;
}
++currentIter;
}
}
}
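// Verify per-request statistics across iterations: scheduling, chunked context progress, generation progress and completion.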
TEST_F(GptExecutorTest, GetLatestRequestStats)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setRequestStatsMaxIterations(1000);
executorConfig.setEnableChunkedContext(true);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the requests
std::vector<std::pair<SizeType32, VecTokens>> requestParams = {
// {maxNewTokens, inputTokens}
{5, {1, 2, 3, 4}}, {4, {1, 1, 2, 3, 5}}, {1, {1}},
{8, VecTokens(383, 1)} // Long enough to be chunked into multiple iterations
};
std::vector<Request> requests;
for (auto requestParam : requestParams)
{
requests.emplace_back(requestParam.second, requestParam.first, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
}
auto requestIdsVec = executor.enqueueRequests(std::move(requests));
std::map<IdType, SizeType32> requestIdToIndex;
std::set<IdType> activeRequests;
for (SizeType32 i = 0; i < requestIdsVec.size(); ++i)
{
auto requestId = requestIdsVec[i];
activeRequests.insert(requestId);
requestIdToIndex[requestId] = i;
}
int iter = 0;
while (!activeRequests.empty() && iter < mMaxWaitMs)
{
for (auto i = activeRequests.begin(); i != activeRequests.end();)
{
auto requestId = *i;
bool thisDone = false;
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
else
{
thisDone = response.getResult().isFinal;
}
}
if (thisDone)
{
// Erase completed request and move to the next one
i = activeRequests.erase(i);
}
else
{
++i;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
// Expect 5 non-empty iterations
// Note: The 6th iteration with the last finished request will be reported
// but might be unavailable when getLatestRequestStats is called since
// it could be updated after the final response has been sent.
auto stats = executor.getLatestRequestStats();
EXPECT_GE(stats.size(), 5);
SizeType32 currentIter = 0;
auto invalidStart = std::numeric_limits<SizeType32>::max();
std::vector<SizeType32> genStart(requestParams.size(), invalidStart); // The iteration index when generation started
std::set<IdType> completedRequests;
for (auto stat = stats.begin(); stat != stats.begin() + 5; ++stat)
{
auto jsonStrIter = JsonSerialization::toJsonStr(*stat);
EXPECT_EQ(stat->iter, currentIter);
EXPECT_THAT(jsonStrIter, testing::HasSubstr("\"iter\":" + std::to_string(currentIter)));
EXPECT_EQ(stat->requestStats.size() + completedRequests.size(), requestParams.size());
for (auto rStat : stat->requestStats)
{
auto jsonStr = JsonSerialization::toJsonStr(rStat);
// Only a few requests here so all of them should be scheduled. A separate test
// GetLatestRequestStatsScheduling will target the scheduling stats.
if (rStat.stage != RequestStage::kGENERATION_COMPLETE)
{
EXPECT_TRUE(rStat.scheduled);
EXPECT_THAT(jsonStr, testing::HasSubstr("\"scheduled\":true"));
}
EXPECT_TRUE(!rStat.paused);
EXPECT_THAT(jsonStr, testing::HasSubstr("\"paused\":false"));
EXPECT_TRUE(requestIdToIndex.count(rStat.id));
EXPECT_THAT(jsonStr, testing::HasSubstr("\"id\":" + std::to_string(rStat.id)));
auto requestIndex = requestIdToIndex[rStat.id];
auto contextSize = requestParams[requestIndex].second.size();
if (rStat.contextPrefillPosition == contextSize) // Check generation phase
{
bool firstIteration{false};
// Context phase is done
EXPECT_TRUE(rStat.stage == RequestStage::kGENERATION_IN_PROGRESS
|| rStat.stage == RequestStage::kGENERATION_COMPLETE);
EXPECT_THAT(jsonStr, testing::HasSubstr("\"stage\":\"GENERATION"));
if (genStart[requestIndex] == invalidStart)
{
// Just started generation
genStart[requestIndex] = currentIter;
firstIteration = true;
}
// One token per iteration
EXPECT_TRUE(currentIter - genStart[requestIndex] == rStat.numGeneratedTokens);
EXPECT_NEAR(rStat.avgNumDecodedTokensPerIter, firstIteration ? 0.f : 1.0f, 1e-9);
if (rStat.stage == RequestStage::kGENERATION_COMPLETE)
{
EXPECT_TRUE(requestParams[requestIndex].first >= rStat.numGeneratedTokens);
completedRequests.insert(requestIndex);
}
else
{
EXPECT_FALSE(completedRequests.count(requestIndex));
}
}
else if (rStat.contextPrefillPosition < contextSize) // Check context phase
{
// Must be chunked
SizeType32 const maxChunkSize = 128;
EXPECT_TRUE(rStat.contextPrefillPosition % maxChunkSize == 0);
// Context phase is on-going
EXPECT_TRUE(rStat.stage == RequestStage::kCONTEXT_IN_PROGRESS);
// No tokens are generated
EXPECT_TRUE(0 == rStat.numGeneratedTokens);
}
else
{
FAIL() << "Out-of-boundary contextPrefillPosition in stats: " << rStat.contextPrefillPosition
<< " out of " << contextSize;
}
// Sanity check that disaggregated serving stats is not set in typical use case
EXPECT_FALSE(rStat.disServingStats.has_value());
}
++currentIter;
}
// We should have visited all requests,
// taking into account that the last completed request may not have been reported yet.
EXPECT_EQ(completedRequests.size() + 1, requestParams.size());
}
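// With more requests than the max batch size, request stats must consistently report scheduled, queued and just-completed requests.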
TEST_F(GptExecutorTest, GetLatestRequestStatsScheduling)
{
// Specifically test the case where there are too many requests to be scheduled in a single iteration
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setRequestStatsMaxIterations(1000);
executorConfig.setEnableChunkedContext(true);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create 100 requests. Note the max batch size for this model is 64 so some requests won't be scheduled right away.
std::vector<std::pair<SizeType32, VecTokens>> requestParams(100, {5, {1, 2, 3, 4}});
std::vector<Request> requests;
requests.reserve(requestParams.size());
for (auto requestParam : requestParams)
{
requests.emplace_back(requestParam.second, requestParam.first, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
}
auto requestIdsVec = executor.enqueueRequests(std::move(requests));
std::map<IdType, SizeType32> requestIdToIndex;
std::set<IdType> activeRequests;
for (SizeType32 i = 0; i < requestIdsVec.size(); ++i)
{
auto requestId = requestIdsVec[i];
activeRequests.insert(requestId);
requestIdToIndex[requestId] = i;
}
int iter = 0;
while (!activeRequests.empty() && iter < mMaxWaitMs)
{
for (auto i = activeRequests.begin(); i != activeRequests.end();)
{
auto requestId = *i;
bool thisDone = false;
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
else
{
thisDone = response.getResult().isFinal;
}
}
if (thisDone)
{
// Erase completed request and move to the next one
i = activeRequests.erase(i);
}
else
{
++i;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
auto stats = executor.getLatestRequestStats();
SizeType32 numFinished = 0;
SizeType32 const maxActiveSize = 64; // Decided by the model
// The 6th iteration's request stats may or may not be available when getLatestRequestStats
// is called. When there are no other active or inTransmission requests, there will be
// another request stats entry that properly resets all the statistics to zero.
for (auto stat = stats.begin(); stat != stats.begin() + 5; ++stat)
{
SizeType32 numReqs = 0;
SizeType32 numReqsActive = 0;
SizeType32 numReqsQueued = 0;
SizeType32 numReqsJustDone = 0;
for (auto rStat : stat->requestStats)
{
++numReqs;
numReqsActive += rStat.scheduled ? 1 : 0;
numReqsQueued += rStat.stage == RequestStage::kQUEUED ? 1 : 0;
numReqsJustDone += rStat.stage == RequestStage::kGENERATION_COMPLETE ? 1 : 0;
}
EXPECT_EQ(numReqs, numReqsActive + numReqsQueued + numReqsJustDone);
EXPECT_EQ(numReqs + numFinished, requestParams.size()); // Should report all unfinished requests
EXPECT_TRUE(numReqsActive <= maxActiveSize); // Not all requests are active due to max active size limit.
numFinished += numReqsJustDone;
}
}
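// Each completed request must appear exactly once with stage kGENERATION_COMPLETE in the request stats.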
TEST_F(GptExecutorTest, GetRequestStatsMultipleRequests)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setRequestStatsMaxIterations(1000);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto sendRequestWaitForResponseFn = [&]()
{
Request request({1, 2, 3}, 5);
auto requestId = executor.enqueueRequest(request);
bool isFinalResponse = false;
while (!isFinalResponse)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto response : responses)
{
if (response.getResult().isFinal)
{
isFinalResponse = true;
break;
}
}
}
return requestId;
};
std::unordered_map<IdType, size_t> requestIdToGenerationComplete;
auto updateStats = [&]()
{
auto stats = executor.getLatestRequestStats();
for (auto& stat : stats)
{
for (auto const& request : stat.requestStats)
{
// only check and aggregate results when the request is completed
if (request.stage == RequestStage::kGENERATION_COMPLETE)
{
requestIdToGenerationComplete[request.id] += 1;
}
}
}
};
auto requestId = sendRequestWaitForResponseFn();
requestIdToGenerationComplete[requestId] = 0;
updateStats();
requestId = sendRequestWaitForResponseFn();
requestIdToGenerationComplete[requestId] = 0;
updateStats();
for (auto [key, value] : requestIdToGenerationComplete)
{
EXPECT_EQ(value, 1);
}
}
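// With dynamic batch size tuning enabled, the tuner-recommended max batch size should shrink as the input length grows.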
TEST_F(GptExecutorTest, BatchSizeTuning)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setRequestStatsMaxIterations(1000);
executorConfig.setEnableChunkedContext(true);
DynamicBatchConfig dynamicBatchConfig(true, false, 1); // Set window size to 1
SchedulerConfig schedulerConfig(CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::nullopt, dynamicBatchConfig);
executorConfig.setSchedulerConfig(schedulerConfig);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
std::vector<SizeType32> tunerRecommendedBatchSizes;
for (size_t i = 0; i <= 8; ++i)
{
auto inputLength = 1 << i; // Note that for this model max input len is 383
Request request(
VecTokens(inputLength, 2), 5, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(std::move(request));
// Wait for current request to finish
while (true)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
if (responses.size() != 0)
{
EXPECT_TRUE(responses.size() == 1);
auto response = responses[0];
EXPECT_FALSE(response.hasError());
if (response.getResult().isFinal)
{
break;
}
}
}
auto reqStats = executor.getLatestIterationStats();
EXPECT_TRUE(reqStats.size() > 0);
auto lastStat = reqStats.back();
tunerRecommendedBatchSizes.push_back(lastStat.maxBatchSizeTunerRecommended);
}
EXPECT_TRUE(tunerRecommendedBatchSizes.size() > 0);
// The recommended batch size is supposed to decrease as the input length increases
EXPECT_TRUE(*tunerRecommendedBatchSizes.begin() > *tunerRecommendedBatchSizes.rbegin());
}
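// With a DebugConfig selecting the sequence_length tensor, verify the debug tensors reported for each iteration.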
TEST_F(GptExecutorTest, GetLatestDebugTensors)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 maxNewTokens = 5;
tensorrt_llm::executor::DebugConfig debugConfig;
debugConfig.setDebugTensorNames({{"sequence_length"}});
debugConfig.setDebugTensorsMaxIterations(maxNewTokens);
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setDebugConfig(debugConfig);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL();
}
else
{
done = response.getResult().isFinal;
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
auto stream = std::make_shared<tr::CudaStream>();
// Expect 5 non-empty iterations
auto debugTensors = executor.getLatestDebugTensors();
EXPECT_EQ(debugTensors.size(), 5);
uint64_t currentIter = 0;
for (auto const& debugIteration : debugTensors)
{
EXPECT_EQ(debugIteration.iter, currentIter);
EXPECT_EQ(debugIteration.debugTensors.size(), 2);
{
auto it = debugIteration.debugTensors.find("request_ids");
EXPECT_NE(it, debugIteration.debugTensors.end());
auto const& tensor = it->second;
auto const& shape = tensor.getShape();
EXPECT_EQ(shape.size(), 1);
EXPECT_EQ(shape[0], 1);
EXPECT_EQ(tensor.getSize(), 1);
auto const* dataPtr = static_cast<SizeType32 const*>(tensor.getData());
EXPECT_EQ(dataPtr[0], 1) << "currentIter " << currentIter;
}
{
auto it = debugIteration.debugTensors.find("sequence_length");
EXPECT_NE(it, debugIteration.debugTensors.end());
auto const& tensor = it->second;
auto const& shape = tensor.getShape();
EXPECT_EQ(shape.size(), 1);
EXPECT_EQ(tensor.getSize(), 1);
auto tensorHost = tensor.copyToCpu(stream);
auto const* dataPtr = static_cast<SizeType32 const*>(tensorHost.getData());
EXPECT_EQ(dataPtr[0], inputTokens.size() + currentIter);
}
++currentIter;
}
}
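// Single request round trip: check response and token counts for streaming/non-streaming, with and without the input echoed in the output.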
TEST_P(ParamTest, SingleRequestDemo)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
// Enqueue the request
auto requestId = executor.enqueueRequest(request);
// Get the new tokens
VecTokens tokens;
SizeType32 numResponses{0};
bool done = false;
int iter = 0;
std::chrono::milliseconds waitTime(1);
while (!done && iter < mMaxWaitMs)
{
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
++numResponses;
if (response.hasError())
{
// This request failed for some reason, get error msg
std::string errStr
= "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg();
FAIL() << errStr;
}
auto result = response.getResult();
done = result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto const expectedSize = streaming ? (beamWidth > 1 ? numResponses : 1)
: (maxNewTokens + (excludeInputFromOutput ? 0 : inputTokens.size()));
EXPECT_EQ(newTokens.size(), expectedSize);
if (streaming && beamWidth > 1)
{
// replace tokens
tokens = newTokens;
}
else
{
// Append tokens
tokens.insert(tokens.end(), newTokens.begin(), newTokens.end());
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_EQ(numResponses, streaming ? maxNewTokens : 1);
EXPECT_EQ(
tokens.size(), streaming ? maxNewTokens : (excludeInputFromOutput ? 0 : inputTokens.size()) + maxNewTokens);
// Expect awaitResponses to return an error message because the request has already terminated (isFinal = true)
auto response = executor.awaitResponses(requestId, waitTime).at(0);
EXPECT_TRUE(response.hasError());
std::string err
= "ReqId " + std::to_string(response.getRequestId()) + " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
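// Enqueue many requests with random prompt lengths and maxNewTokens and check token counts, response counts and finish reasons.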
TEST_P(ParamTest, MultipleRequestDemo)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 numRequests = 20;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 maxPromptLen = 20;
SizeType32 maxMaxNewTokens = 20;
SizeType32 endId = -1;
// Enqueue the requests
std::unordered_map<IdType, VecTokens> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
std::unordered_map<IdType, SizeType32> expectedNumResponses;
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId);
auto reqId = executor.enqueueRequest(std::move(request));
tokens[reqId] = {};
expectedNumTokens[reqId] = ((streaming || excludeInputFromOutput) ? 0 : promptLen) + maxNewTokens;
expectedNumResponses[reqId] = streaming ? maxNewTokens : 1;
}
// Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
std::unordered_map<IdType, SizeType32> numResponses;
while (numFinished < numRequests && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
auto reqId = response.getRequestId();
++numResponses[reqId];
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto const expectedSize
= streaming ? (beamWidth > 1 ? numResponses[reqId] : 1) : expectedNumTokens[reqId];
EXPECT_EQ(newTokens.size(), expectedSize);
auto& reqTokens = tokens.at(response.getRequestId());
if (streaming && beamWidth > 1)
{
reqTokens = newTokens;
}
else
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
for (SizeType32 b = 0; b < beamWidth; ++b)
{
EXPECT_EQ(result.finishReasons.at(b),
result.isFinal ? FinishReason::kLENGTH : FinishReason::kNOT_FINISHED);
}
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
// Check that number of tokens matches expectations
for (auto const& [reqId, numTokens] : expectedNumTokens)
{
EXPECT_EQ(expectedNumResponses[reqId], numResponses[reqId]) << "reqId " << reqId;
EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId;
}
}
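// Enqueue many requests while a separate thread polls getLatestIterationStats; parameterized over iterStatsMaxIterations and leader/orchestrator mode.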
TEST_P(ParamStatsTest, MultipleRequestStats)
{
bool streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 numRequests = 100;
auto iterStatsMaxIterations = std::get<0>(GetParam());
bool useOrchestratorMode = std::get<1>(GetParam());
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setIterStatsMaxIterations(iterStatsMaxIterations);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
std::optional<OrchestratorConfig> orchestratorConfig = std::nullopt;
if (useOrchestratorMode)
{
orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
}
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt, std::nullopt,
orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 maxPromptLen = 20;
SizeType32 maxMaxNewTokens = 20;
SizeType32 endId = -1;
// Enqueue the requests
std::unordered_map<IdType, VecTokens> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId);
auto reqId = executor.enqueueRequest(std::move(request));
tokens[reqId] = {};
expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 0 : promptLen)) + maxNewTokens;
}
std::atomic<bool> statsThreadDone = false;
std::atomic<int32_t> numFinished = 0;
std::deque<IterationStats> iterStatsReceived;
// Spawn a thread that continuously gets stats
auto statsThread = std::thread(
[&executor, &numFinished, numRequests, &iterStatsReceived, &statsThreadDone]()
{
while (numFinished < numRequests)
{
auto reqStats = executor.getLatestIterationStats();
iterStatsReceived.insert(iterStatsReceived.end(), std::make_move_iterator(reqStats.begin()),
std::make_move_iterator(reqStats.end()));
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
statsThreadDone = true;
});
// Get the new tokens for each request
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < numRequests && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto& reqTokens = tokens.at(response.getRequestId());
reqTokens.insert(reqTokens.end(), std::make_move_iterator(newTokens.begin()),
std::make_move_iterator(newTokens.end()));
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
// Check that number of tokens matches expectations
for (auto const& [reqId, numTokens] : expectedNumTokens)
{
EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId;
}
// Wait for stats thread to be done, fail otherwise
iter = 0;
while (!statsThreadDone && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
std::this_thread::sleep_for(std::chrono::milliseconds(waitTime));
iter++;
}
ASSERT_TRUE(statsThreadDone);
if (iterStatsMaxIterations > 0)
{
ASSERT_GT(iterStatsReceived.size(), 1);
for (auto stats : iterStatsReceived)
{
EXPECT_GT(stats.numActiveRequests, 0);
TLLM_LOG_INFO("%d %d", stats.iter, stats.numActiveRequests);
EXPECT_TRUE(stats.inflightBatchingStats.has_value());
if (stats.inflightBatchingStats.has_value())
{
EXPECT_GT(stats.inflightBatchingStats.value().numScheduledRequests, 0);
}
}
}
statsThread.join();
}
TEST_P(ParamTest, MultipleRequestBatchResponses)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 constexpr numRequests{20};
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr maxPromptLen{20};
SizeType32 constexpr maxMaxNewTokens{20};
SizeType32 endId = -1;
// Enqueue the requests
std::unordered_map<IdType, VecTokens> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
std::vector<IdType> requestIds;
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId);
auto reqId = executor.enqueueRequest(std::move(request));
requestIds.push_back(reqId);
tokens[reqId] = {};
expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 0 : promptLen)) + maxNewTokens;
}
    // Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
std::chrono::milliseconds waitTime(1);
while (numFinished < numRequests && iter < mMaxWaitMs)
{
auto idResponses = executor.awaitResponses(requestIds, waitTime);
for (unsigned i = 0; i < requestIds.size(); ++i)
{
auto& responses = idResponses[i];
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto& reqTokens = tokens.at(response.getRequestId());
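                    // With streaming beam search, each response carries all tokens generated so far,
                    // so overwrite the stored tokens instead of appending.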
if (streaming && beamWidth > 1)
{
reqTokens = newTokens;
}
else
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
    // Call awaitResponses again; we expect to see only terminated-request-id errors.
auto idResponses = executor.awaitResponses(requestIds, waitTime);
for (auto const& responses : idResponses)
{
for (auto& response : responses)
{
EXPECT_TRUE(response.hasError());
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
// Check that number of tokens matches expectations
for (auto const& [reqId, numTokens] : expectedNumTokens)
{
EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId;
}
}
TEST_P(ParamTest, GetNumResponsesReadyTest)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 maxNumRequests = 50;
SizeType32 maxPromptLen = 20;
SizeType32 maxMaxNewTokens = 20;
SizeType32 numRequests = rand() % maxNumRequests + 1;
SizeType32 numExpectedResponses = 0;
std::map<IdType, SizeType32> reqNumExpectedResponses;
std::vector<IdType> ids;
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto id = executor.enqueueRequest(std::move(request));
ids.emplace_back(id);
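        // Streaming yields one response per generated token; non-streaming yields a single final response.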
reqNumExpectedResponses[id] = streaming ? maxNewTokens : 1;
numExpectedResponses += reqNumExpectedResponses.at(id);
}
SizeType32 iter = 0;
SizeType32 numReady = 0;
while (numReady < numExpectedResponses && iter < mMaxWaitMs)
{
numReady = 0;
for (auto id : ids)
{
numReady += executor.getNumResponsesReady(id);
}
std::this_thread::sleep_for(std::chrono::milliseconds(1));
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
    // Expect the precomputed number of responses for each request
for (auto id : ids)
{
        SizeType32 const numReadyForReq = executor.getNumResponsesReady(id);
        EXPECT_EQ(numReadyForReq, reqNumExpectedResponses.at(id));
}
auto numResponsesReady = executor.getNumResponsesReady();
EXPECT_EQ(numResponsesReady, numExpectedResponses);
}
namespace
{
void runTest(Executor& executor, fs::path const& inputPath, ModelIds const& modelIds,
FlakyTestInfo const& flakyTestInfo, bool streaming, SizeType32 const vocabSizePadded, BeamResult const& beamResult,
OutputConfig const& outConfig, bool isSpeculativeDecoding, int maxWaitMs, bool returnAllGeneratedTokens,
SizeType32 const numReturnSequences, bool isNonGreedySampling, SizeType32 const modelParallelism)
{
auto const beamWidth = beamResult.beamWidth;
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
auto const* const givenInputData = tr::bufferCast<TokenIdType const>(*givenInput);
auto const& inputShape = givenInput->getShape();
ASSERT_EQ(inputShape.nbDims, 2);
ASSERT_GT(inputShape.d[0], 0);
// Load expected outputs for each beam width value
auto testData = TestData::loadTestData(beamResult, *givenInput, beamWidth, manager, outConfig, modelIds);
auto const maxSeqLen = testData.maxSeqLen;
// Load expected outputs and inputs
SizeType32 numRequests = static_cast<SizeType32>(givenInputLengths.size());
SizeType32 maxRequests = numRequests;
std::vector<Request> requests;
std::vector<SizeType32> reqMaxNewTokens;
auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
    // Top-k is set to a large value to test that the N returned sequences are not identical.
if (isNonGreedySampling)
{
samplingConfig.setTopK(32);
}
samplingConfig.setNumReturnSequences(numReturnSequences);
for (SizeType32 req = 0; req < maxRequests; ++req)
{
SizeType32 inputLen = givenInputLengths.at(req);
auto maxNewTokens = maxSeqLen - maxInputLength;
reqMaxNewTokens.push_back(maxNewTokens);
SizeType32 endId = -1;
auto const* const seqBegin = givenInputData + req * maxInputLength;
        VecTokens tokens(seqBegin, seqBegin + inputLen);
        auto request = Request(std::move(tokens), maxNewTokens, streaming, samplingConfig, outConfig, endId);
request.setReturnAllGeneratedTokens(returnAllGeneratedTokens);
requests.emplace_back(std::move(request));
}
auto& comm = tensorrt_llm::mpi::MpiComm::world();
auto const worldRank = comm.getRank();
// Expected return sizes.
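    // With beam search (beamWidth > 1) a request yields a single sequence whose result carries
    // min(beamWidth, numReturnSequences) beams; with sampling it yields numReturnSequences single-beam sequences.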
auto const numSequences = beamWidth > 1 ? 1 : numReturnSequences;
auto const numReturnBeams = std::min(beamWidth, numReturnSequences);
if (worldRank == 0)
{
auto const reqIds = executor.enqueueRequests(requests);
std::unordered_map<SizeType32, std::vector<BeamTokens>> tokens;
std::unordered_map<IdType, SizeType32> reqIdToBatchId;
for (SizeType32 req = 0; req < reqIds.size(); ++req)
{
std::vector<BeamTokens> resultTokens(numSequences, BeamTokens(numReturnBeams));
tokens[req] = std::move(resultTokens);
reqIdToBatchId[reqIds.at(req)] = req;
}
        // Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
std::unordered_map<IdType, SizeType32> numResponses;
while (numFinished < maxRequests && iter < maxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
auto batchId = reqIdToBatchId.at(response.getRequestId());
numResponses[batchId]++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
auto seqIdx = result.sequenceIndex;
auto const& contextLogits = result.contextLogits;
auto const& genLogits = result.generationLogits;
auto const& outputTokenIds = result.outputTokenIds;
EXPECT_EQ(result.finishReasons.size(), numReturnBeams);
for (SizeType32 beam = 0; beam < numReturnBeams; ++beam)
{
auto const& newTokens = outputTokenIds.at(beam);
auto& reqTokens = tokens.at(batchId).at(seqIdx).at(beam);
if (!returnAllGeneratedTokens)
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
else
{
EXPECT_EQ(newTokens.size(),
(numResponses.at(batchId) + numReturnSequences - 1) / numReturnSequences);
reqTokens = newTokens;
}
// FinishReason is only supported for bw=1 and inflight batching.
if (beamWidth == 1)
{
EXPECT_EQ(result.finishReasons.at(beam),
result.isSequenceFinal ? FinishReason::kLENGTH : FinishReason::kNOT_FINISHED);
}
}
auto const& cumLogProbs = result.cumLogProbs;
auto const& logProbs = result.logProbs;
auto const& beamTokens = tokens.at(batchId).at(seqIdx);
EXPECT_EQ(beamTokens.size(), numReturnBeams);
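                    // Reference log-prob and logits comparisons are only valid for deterministic decoding,
                    // so skip them when sampling non-greedily.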
if (!isNonGreedySampling)
{
float const logitsAtol = modelParallelism > 1 ? 1e-1 : 1e-2;
float const logitsRtol = modelParallelism > 1 ? 1e-2 : 1e-3;
testData.verifyLogProbs(outConfig.returnLogProbs, streaming, outConfig.excludeInputFromOutput,
givenInputLengths.at(batchId), beamWidth, beamTokens, cumLogProbs, logProbs, batchId,
flakyTestInfo);
testData.validateContextLogits(outConfig.returnContextLogits, givenInputLengths.at(batchId),
beamWidth, contextLogits, vocabSizePadded, batchId, logitsAtol, logitsRtol);
testData.validateGenerationLogits(outConfig.returnGenerationLogits, result.isSequenceFinal,
streaming, outConfig.excludeInputFromOutput, givenInputLengths.at(batchId),
reqMaxNewTokens.at(batchId), beamWidth, beamTokens, genLogits, vocabSizePadded, batchId,
returnAllGeneratedTokens, logitsAtol, logitsRtol);
}
// Ignore first iteration as it doesn't use draft tokens
if (outConfig.returnPerfMetrics && isSpeculativeDecoding
&& result.requestPerfMetrics.value().iter > 0)
{
auto& specDecMetrics = result.requestPerfMetrics.value().speculativeDecoding;
// 4 draft tokens are used per step
EXPECT_EQ(specDecMetrics.totalDraftTokens, result.requestPerfMetrics.value().iter.value() * 4);
EXPECT_EQ(specDecMetrics.acceptanceRate,
static_cast<float>(specDecMetrics.totalAcceptedDraftTokens)
/ specDecMetrics.totalDraftTokens);
}
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, maxWaitMs);
testData.verifyOutput(tokens, givenInputLengths, streaming, outConfig.excludeInputFromOutput, flakyTestInfo,
isSpeculativeDecoding, beamWidth, numSequences, isNonGreedySampling);
}
}
void runTest(fs::path const& modelPath, ExecutorConfig const& executorConfig, fs::path const& inputPath,
ModelIds const& modelIds, FlakyTestInfo const& flakyTestInfo, bool streaming, SizeType32 const vocabSizePadded,
BeamResult const& beamResult, OutputConfig const& outConfig, bool isSpeculativeDecoding, int maxWaitMs,
bool returnAllGeneratedTokens, SizeType32 const numReturnSequences, bool isNonGreedySampling,
SizeType32 const modelParallelism)
{
auto executor = Executor{modelPath, ModelType::kDECODER_ONLY, executorConfig};
runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig,
isSpeculativeDecoding, maxWaitMs, returnAllGeneratedTokens, numReturnSequences, isNonGreedySampling,
modelParallelism);
}
ExecutorConfig createExecutorConfig(SizeType32 maxBeamWidth, bool useOrchestratorMode, bool gatherGenerationLogits,
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt,
std::optional<std::vector<SizeType32>> participantIds = std::nullopt)
{
// Note: we reduce memory fraction for cases that return context/generation logits which require more free
// memory
FloatType constexpr freeGpuMemoryFraction{0.5F};
KvCacheConfig kvCacheConfig(false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
auto executorConfig = ExecutorConfig(maxBeamWidth);
executorConfig.setKvCacheConfig(kvCacheConfig);
executorConfig.setNormalizeLogProbs(false);
executorConfig.setGatherGenerationLogits(gatherGenerationLogits);
std::optional<OrchestratorConfig> orchestratorConfig = std::nullopt;
if (useOrchestratorMode)
{
orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
}
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::move(deviceIds),
std::move(participantIds), orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
return executorConfig;
}
} // namespace
TEST_P(AllParamsTest, TokenComparison)
{
auto const streaming = std::get<0>(GetParam());
auto const& beamWidth = std::get<1>(GetParam());
OutputConfig outConfig;
outConfig.returnLogProbs = std::get<2>(GetParam());
outConfig.excludeInputFromOutput = std::get<3>(GetParam());
outConfig.returnContextLogits = std::get<4>(GetParam());
outConfig.returnGenerationLogits = std::get<5>(GetParam());
auto const modelName = std::get<6>(GetParam());
auto const useOrchestratorMode = std::get<7>(GetParam());
auto const returnAllGeneratedTokens = std::get<8>(GetParam());
auto const numReturnSequences = std::get<9>(GetParam());
if (returnAllGeneratedTokens && !streaming)
{
GTEST_SKIP() << "Test does not support returnAllGeneratedTokens without streaming";
}
std::optional<std::vector<SizeType32>> participantIds = std::nullopt;
BeamResult beamResult{beamWidth};
ASSERT_TRUE(fs::exists(DATA_PATH));
fs::path modelPath;
// set defaults and adjust if needed by different models
fs::path inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
bool isSpeculativeDecoding{false};
SizeType32 vocabSizePadded = 50257;
// NOTE: This can be used to disable checks for certain prompt batch entries
FlakyTestInfo flakyTestInfo;
if (modelName == "gpt")
{
auto const resultsPath
= GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
if (outConfig.returnContextLogits || outConfig.returnGenerationLogits)
{
modelPath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu";
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();
beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
if (outConfig.returnLogProbs)
{
beamResult.cumLogProbsFile
= resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_CUM_LOG_PROBS_FILE();
beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_LOG_PROBS_FILE();
}
}
else
{
modelPath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_FILE();
if (outConfig.returnLogProbs)
{
beamResult.cumLogProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CUM_LOG_PROBS_FILE();
beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_LOG_PROBS_FILE();
}
}
}
else if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1"
|| modelName == "llama_tp1_pp2_cp1")
{
inputPath = DATA_PATH / LLAMA_INPUT_FILE;
modelIds.padId = LLAMA_PAD_ID;
modelIds.endId = LLAMA_END_ID;
vocabSizePadded = LLAMA_VOCAB_SIZE_PADDED;
auto const resultsPath
= LLAMA_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
if (modelName == "llama_tp4_pp1_cp1")
{
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP4_PP1_FILE();
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP4_FILE();
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
}
else if (modelName == "llama_tp1_pp2_cp1")
{
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP1_PP2_FILE();
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp2-cp1-gpu";
}
else if (modelName == "llama_tp2_pp2_cp1")
{
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_TP2_PP2_FILE();
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
}
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_TP4_PP1_FILE();
if (outConfig.returnLogProbs)
{
beamResult.cumLogProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CUM_LOG_PROBS_TP4_PP1_FILE();
beamResult.logProbsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_LOG_PROBS_TP4_PP1_FILE();
}
}
else if (modelName == "medusa")
{
TLLM_CHECK_WITH_INFO(beamWidth == 1, "Medusa does not support beam search.");
auto const resultsPath = MEDUSA_DATA_PATH / "sampling";
auto modelSpec = ModelSpec::getDefaultModelSpec()
.useMedusa()
.setInputFile("input_tokens_long.npy")
.setMaxOutputLength(128);
beamResult.resultsFile = resultsPath / modelSpec.getResultsFile();
modelPath = MEDUSA_MODEL_PATH / modelSpec.getModelPath() / "tp1-pp1-cp1-gpu";
inputPath = DATA_PATH / "input_vicuna.npy";
modelIds.padId = 2;
modelIds.endId = 2;
isSpeculativeDecoding = true;
outConfig.returnPerfMetrics = true;
}
else if (modelName == "chatglm" || modelName == "chatglm2" || modelName == "chatglm3" || modelName == "glm")
{
fs::path resultsPath;
if (modelName == "chatglm")
{
resultsPath = CHATGLM_DATA_PATH;
modelPath = CHATGLM_MODEL_PATH;
}
else if (modelName == "chatglm2")
{
resultsPath = CHATGLM2_DATA_PATH;
modelPath = CHATGLM2_MODEL_PATH;
}
else if (modelName == "chatglm3")
{
resultsPath = CHATGLM3_DATA_PATH;
modelPath = CHATGLM3_MODEL_PATH;
}
else if (modelName == "glm")
{
resultsPath = GLM_DATA_PATH;
modelPath = GLM_MODEL_PATH;
}
resultsPath /= (beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth);
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_RESULT_FILE();
modelPath = modelPath / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
char versionChatglm{0};
if (size_t index = modelPath.string().find("chatglm"); index != std::string::npos)
{
versionChatglm = modelPath.string()[index + 7];
std::string const vChatglmString
= (versionChatglm == '-') ? std::string("") : std::string(1, versionChatglm);
inputPath = DATA_PATH / ("input_tokens_chatglm" + vChatglmString + "-6b.npy");
modelIds.padId = (versionChatglm == '-') ? 3 : 0;
modelIds.endId = (versionChatglm == '-') ? 130005 : 2;
}
else if (size_t index = modelPath.string().find("glm-10b"); index != std::string::npos)
{
inputPath = DATA_PATH / "input_tokens_glm-10b.npy";
modelIds.padId = 50256;
modelIds.endId = 50258;
}
if (versionChatglm != 0)
{
flakyTestInfo.batchIdBeams.insert(std::make_pair(1, 0));
}
}
else
{
TLLM_THROW("Unrecognized modelName");
}
if (streaming && beamWidth > 1)
{
GTEST_SKIP() << "Test does not support streaming with beam search";
}
// Warning: This should be the last check before running the test.
// It will initialize MPI which can take significant time.
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1"
|| modelName == "llama_tp1_pp2_cp1")
{
        // For the Llama models, only run when multiple GPUs are available.
        // This is detected via an env variable set when running the test.
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == nullptr)
{
GTEST_SKIP() << "Skipping Llama test";
}
if (outConfig.returnContextLogits)
{
GTEST_SKIP() << "Skipping context logits tests for mpi runs";
}
        // Check that it was launched with the right number of MPI ranks
if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
{
            // Orchestrator mode, the test process world size must be 1
            FAIL() << "Orchestrator mode and world size is not equal to 1";
}
}
auto decoderJsonConfig = tensorrt_llm::runtime::GptJsonConfig::parse(modelPath / "config.json");
auto const modelTP = decoderJsonConfig.getTensorParallelism();
auto const modelPP = decoderJsonConfig.getPipelineParallelism();
auto const modelParallelism = modelTP * modelPP;
int deviceCount = -1;
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
std::optional<std::vector<SizeType32>> deviceIds = std::vector<SizeType32>(modelParallelism);
for (auto i = 0; i < deviceIds->size(); i++)
{
deviceIds->at(i) = i % deviceCount;
}
if (modelName == "llama_tp1_pp2_cp1")
{
auto const& session = tensorrt_llm::mpi::MpiComm::world();
if (session.getSize() != 4)
{
FAIL() << "Llama-tp1-pp2 is intended solely for testing coexisting engines within the same MPI world,"
" which requires a session size of 4. However, the current session size is "
<< session.getSize() << " .";
}
if (session.getRank() / 2 == 0)
{
participantIds = std::vector<SizeType32>{0, 1};
deviceIds = std::vector<SizeType32>{0, 1};
}
else
{
participantIds = std::vector<SizeType32>{2, 3};
deviceIds = std::vector<SizeType32>{2, 3};
}
}
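    // When pipeline parallelism is used, scramble the default device order: reverse the PP groups, then restore
    // ascending order within each TP group (e.g. {2, 3, 0, 1} for tp2-pp2), presumably to exercise a non-default
    // deviceIds mapping, matching the explicit deviceIds used in other multi-GPU tests here.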
if (modelPP > 1)
{
std::reverse(deviceIds->begin(), deviceIds->end());
if (modelTP > 1)
{
for (SizeType32 ppRank = 0; ppRank < modelPP; ppRank++)
{
                std::reverse(deviceIds->begin() + ppRank * modelTP, deviceIds->begin() + (ppRank + 1) * modelTP);
}
}
}
    // Returning logits increases latency
if (streaming && (outConfig.returnContextLogits || outConfig.returnGenerationLogits))
{
mMaxWaitMs = 20000;
}
auto executorConfig = createExecutorConfig(beamWidth, useOrchestratorMode, outConfig.returnGenerationLogits,
std::move(deviceIds), std::move(participantIds));
runTest(modelPath, executorConfig, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult,
outConfig, isSpeculativeDecoding, mMaxWaitMs, returnAllGeneratedTokens, numReturnSequences, false,
modelParallelism);
}
TEST_F(GptExecutorTest, ChangeBeamWidth)
{
SizeType32 constexpr maxBeamWidth{2};
auto executorConfig = ExecutorConfig(maxBeamWidth);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr beamWidth1{1};
SizeType32 constexpr beamWidth2{2};
SizeType32 constexpr maxNewTokens{2};
VecTokens inputTokens{1, 2, 3, 4};
// Create requests with different beam widths
std::vector<Request> requests;
requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1));
requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1));
requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth2));
requests.emplace_back(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth1));
auto requestIds = executor.enqueueRequests(requests);
int numFinished = 0;
int iter = 0;
while (numFinished < 4 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
std::cout << "err:" << err << std::endl;
FAIL() << "Should not get a response with error";
}
else
{
auto result = response.getResult();
numFinished += static_cast<int>(result.isFinal);
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
auto stats = executor.getLatestIterationStats();
uint64_t currentIter = 0;
for (auto const& stat : stats)
{
// TODO: enable this check when stats are cleaned
// EXPECT_EQ(stat.iter, currentIter);
if (stat.iter < 2)
{
            // Requests 1 and 2 run with the same beam width
EXPECT_EQ(stat.numActiveRequests, 2);
}
else if (stat.numActiveRequests != 0) // TODO: remove this check when stats are cleaned
{
            // Requests 3 and 4 run with a different beam width
EXPECT_EQ(stat.numActiveRequests, 1);
}
++currentIter;
}
}
void doTokenComparisonChangeBeamWidth(bool enableReuse, SizeType32 maxWaitMs)
{
SizeType32 constexpr maxBeamWidth{2};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
auto constexpr streaming = false;
// Create executor config
auto kvCacheConfig = KvCacheConfig(enableReuse);
auto executorConfig = ExecutorConfig(maxBeamWidth, SchedulerConfig(), kvCacheConfig);
// Create executor
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto const inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
OutputConfig outConfig;
FlakyTestInfo flakyTestInfo;
bool constexpr isSpeculativeDecoding{false};
for (SizeType32 beamWidth : {1, 2})
{
BeamResult beamResult{beamWidth};
auto const resultsPath
= GPT_DATA_PATH / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth));
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();
beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig,
isSpeculativeDecoding, maxWaitMs, false, 1, false, 1);
}
}
TEST_F(GptExecutorTest, TokenComparisonChangeBeamWidth)
{
doTokenComparisonChangeBeamWidth(false, mMaxWaitMs);
}
TEST_F(GptExecutorTest, TokenComparisonChangeBeamWidthBlockReuse)
{
doTokenComparisonChangeBeamWidth(true, mMaxWaitMs);
}
TEST_F(GptExecutorTest, NReturnRandomness)
{
SizeType32 constexpr maxBeamWidth{1};
SizeType32 constexpr numReturnSequences{2};
SizeType32 constexpr vocabSizePadded{50257}; // gpt vocabSizePadded
auto constexpr streaming = false;
// Create executor config
auto executorConfig = ExecutorConfig(maxBeamWidth);
// Create executor
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto const inputPath = DATA_PATH / "input_tokens.npy";
ModelIds modelIds{50256, 50256};
OutputConfig outConfig;
FlakyTestInfo flakyTestInfo;
bool constexpr isSpeculativeDecoding{false};
BeamResult beamResult{maxBeamWidth};
auto const resultsPath = GPT_DATA_PATH / "sampling";
beamResult.resultsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GATHER_RESULT_FILE();
beamResult.contextLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_CONTEXT_LOGITS_FILE();
beamResult.genLogitsFile = resultsPath / PathUtil::FP16_PLUGIN_PACKED_PAGED_GENERATION_LOGITS_FILE();
runTest(executor, inputPath, modelIds, flakyTestInfo, streaming, vocabSizePadded, beamResult, outConfig,
        isSpeculativeDecoding, mMaxWaitMs, false, numReturnSequences, true, 1);
}
TEST_F(GptExecutorTest, TimedOut)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// No requests enqueued, expect no responses
auto numResponsesReady = executor.getNumResponsesReady();
EXPECT_EQ(numResponsesReady, 0);
std::chrono::milliseconds waitTime(10);
auto responses = executor.awaitResponses(waitTime);
EXPECT_EQ(responses.size(), 0);
}
TEST_F(GptExecutorTest, MaxSeqIdleMicrosecondsError)
{
auto executorConfig = ExecutorConfig(1);
// Request will time out
executorConfig.setMaxSeqIdleMicroseconds(1);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr maxNewTokens{5};
VecTokens inputTokens{1, 2, 3, 4};
std::vector<Request> requests;
requests.emplace_back(inputTokens, maxNewTokens, false);
auto requestIds = executor.enqueueRequests(requests);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
std::cout << "err:" << err << std::endl;
EXPECT_THAT(err, testing::HasSubstr("Unable to get batch slot for reqId"));
done = true;
}
else
{
FAIL() << "Should get a response with error";
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
}
void logitsProcessorMixedReqsTest(std::string const& modelDir, SizeType32 worldRank, SizeType32 maxWaitMs,
bool replicated, std::optional<std::vector<SizeType32>> deviceIds);
TEST_P(LogitsProcParamsTest, All)
{
auto const modelName = std::get<0>(GetParam());
auto const batched = std::get<1>(GetParam());
auto const replicated = std::get<2>(GetParam());
std::string modelDir;
int tp_size = 1, pp_size = 1, cp_size = 1;
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
if (modelName == "llama_tp1_pp1_cp1")
{
modelDir = "tp1-pp1-cp1-gpu";
}
else if (modelName == "llama_tp4_pp1_cp1")
{
modelDir = "tp4-pp1-cp1-gpu";
tp_size = 4;
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelDir = "tp1-pp4-cp1-gpu";
pp_size = 4;
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelDir = "tp2-pp2-cp1-gpu";
tp_size = pp_size = 2;
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
else
{
TLLM_THROW("Unrecognized modelName");
}
std::filesystem::path modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / modelDir;
auto& comm = tensorrt_llm::mpi::MpiComm::world();
auto const worldRank = comm.getRank();
auto const worldSize = comm.getSize();
if (tp_size * pp_size * cp_size != 1)
{
// Run multi GPU test only when env variable is set
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
        if (val == nullptr)
{
GTEST_SKIP() << "Skipping multi-gpu logits post processor test";
}
if (worldSize != 4)
{
FAIL() << "Leader mode and world size is not equal to 4";
}
}
else
{
// This has no effect for single-GPU tests
if (replicated)
{
GTEST_SKIP() << "Skipping single-gpu replicated logits post processor test";
}
}
// Configuration options
bool const streaming = false;
bool excludeInputFromOutput = false;
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
SizeType32 numRequests = 20;
IdType const kClientId = 1234;
SizeType32 beamWidth = 1;
SizeType32 maxPromptLen = 20;
SizeType32 maxMaxNewTokens = 20;
SizeType32 constexpr endId{2};
SizeType32 constexpr vocabSizePadded{32000}; // llama-7b vocabSizePadded
    // tokenIdCalculator generates a token_id from the request id and the output position.
    // The LogitsPostProcessor then sets all logits except the generated token_id to a large negative value,
    // so the output token must be the token produced by tokenIdCalculator.
auto tokenIdCalculator = [endId, vocabSizePadded](IdType req, SizeType32 pos)
{
SizeType32 tokenId = (req * 1000 + pos) % vocabSizePadded;
if (tokenId == endId)
{
tokenId = 0;
}
return tokenId;
};
std::unordered_map<IdType, VecTokens> tokens;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
std::unordered_map<IdType, VecTokens> expectedOutputTokens;
// Enqueue the requests
auto enqueueRequests = [&](Executor& executor, std::optional<std::string const> logitsProcessorName,
std::optional<LogitsPostProcessor> logitsProcessor = std::nullopt)
{
tokens.clear();
expectedNumTokens.clear();
expectedOutputTokens.clear();
for (SizeType32 req = 0; req < numRequests; ++req)
{
SizeType32 promptLen = rand() % maxPromptLen + 1;
SizeType32 maxNewTokens = rand() % maxMaxNewTokens + 1;
auto request = Request(VecTokens(promptLen, 1), maxNewTokens, streaming,
tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig, endId);
request.setClientId(kClientId);
if (logitsProcessorName)
{
request.setLogitsPostProcessorName(logitsProcessorName.value());
}
else if (logitsProcessor)
{
request.setLogitsPostProcessor(logitsProcessor.value());
}
auto reqId = executor.enqueueRequest(std::move(request));
tokens[reqId] = {};
expectedNumTokens[reqId] = (streaming ? 0 : (excludeInputFromOutput ? 0 : promptLen)) + maxNewTokens;
expectedOutputTokens[reqId] = {};
if (!streaming && !excludeInputFromOutput)
{
expectedOutputTokens[reqId].resize(promptLen, 1);
}
for (SizeType32 outputPos = 0; outputPos < maxNewTokens; ++outputPos)
{
SizeType32 outputTokenId = tokenIdCalculator(reqId, outputPos + promptLen);
expectedOutputTokens[reqId].push_back(outputTokenId);
}
}
};
    // Get the new tokens for each request
auto collectResponses = [&](Executor& executor)
{
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < numRequests && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
EXPECT_EQ(response.getClientId().value(), kClientId);
auto result = response.getResult();
numFinished += result.isFinal;
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
auto& reqTokens = tokens.at(response.getRequestId());
reqTokens.insert(reqTokens.end(), std::make_move_iterator(newTokens.begin()),
std::make_move_iterator(newTokens.end()));
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
};
    // Check that the tokens match expectations
auto checkOutput = [&]()
{
for (auto const& [reqId, numTokens] : expectedNumTokens)
{
EXPECT_EQ(expectedNumTokens[reqId], tokens[reqId].size()) << "reqId " << reqId;
for (SizeType32 tokenPos = 0;
tokenPos < std::min<SizeType32>(expectedNumTokens[reqId], tokens[reqId].size()); ++tokenPos)
{
EXPECT_EQ(expectedOutputTokens[reqId][tokenPos], tokens[reqId][tokenPos])
<< "reqId=" << reqId << ", tokenPos=" << tokenPos;
}
}
};
// Test non-batched logits processor
std::string const logitsProcessorName = "SelectToken";
auto logitsPostProcessorFn = [&](IdType reqId, Tensor& logits, BeamTokens const& tokens, StreamPtr const& streamPtr,
std::optional<IdType> clientId)
{
if (replicated)
{
EXPECT_TRUE(worldRank <= tp_size - 1);
}
else
{
EXPECT_TRUE(worldRank == 0);
}
EXPECT_TRUE(clientId.value() == kClientId);
SizeType32 numTokens = tokens.at(0).size();
SizeType32 pos = numTokens;
SizeType32 outputTokenId = tokenIdCalculator(reqId, pos);
auto logitsDataType = logits.getDataType();
EXPECT_TRUE(logitsDataType == DataType::kFP16 || logitsDataType == DataType::kBF16
|| logitsDataType == DataType::kFP32);
// logits has shape [draftLength + 1, reqBeamWidth, vocabSize]
auto logitsCpu = tensorrt_llm::executor::Tensor::cpu(logitsDataType, logits.getShape());
auto* dataPtr = logitsCpu.getData();
auto eltSize = logitsCpu.getSizeInBytes() / logitsCpu.getSize();
EXPECT_TRUE(eltSize == 2 || eltSize == 4);
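        // For 16-bit logits (FP16/BF16) fill with the most negative finite bit pattern; for FP32 fill with -inf.
        // Only the selected token keeps a logit of zero, so it is the only token that can be sampled.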
if (eltSize == 2)
{
auto* dataPtrU16 = static_cast<uint16_t*>(dataPtr);
uint16_t hugeNegValue = logitsDataType == DataType::kFP16 ? 0xFBFF : 0xFF7F; // a huge negative value
for (size_t i = 0; i < logitsCpu.getSize(); ++i)
{
dataPtrU16[i] = hugeNegValue;
}
dataPtrU16[outputTokenId] = 0;
}
else
{
auto* dataPtrFloat = static_cast<float*>(dataPtr);
for (size_t i = 0; i < logitsCpu.getSize(); ++i)
{
dataPtrFloat[i] = -HUGE_VALF;
}
dataPtrFloat[outputTokenId] = 0.0f;
}
logits.setFrom(logitsCpu, streamPtr);
};
if (!batched)
{
auto executorConfig = ExecutorConfig(beamWidth);
LogitsPostProcessorConfig logitsProcConfig{
std::unordered_map<std::string, tensorrt_llm::executor::LogitsPostProcessor>{
{logitsProcessorName, logitsPostProcessorFn}},
std::nullopt, replicated};
executorConfig.setLogitsPostProcessorConfig(logitsProcConfig);
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
if (worldRank == 0)
{
enqueueRequests(executor, logitsProcessorName);
collectResponses(executor);
checkOutput();
if (!replicated || tp_size == 1)
{
// Dynamic logits postprocessor must be used with replicate=false or no tensor parallelism.
enqueueRequests(executor, std::nullopt, logitsPostProcessorFn);
collectResponses(executor);
checkOutput();
}
}
}
// Test batched logits processor
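    // The batched variant simply forwards each entry of the batch to the per-request callback above.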
auto logitsPostProcessorBatchedFn
= [logitsPostProcessorFn](std::vector<IdType> const& reqIdBatch, std::vector<Tensor>& logitsBatch,
std::vector<std::reference_wrapper<BeamTokens const>> const& tokensBatch, StreamPtr const& streamPtr,
std::vector<std::optional<IdType>> const& clientIdBatch)
{
for (int sample = 0; sample < reqIdBatch.size(); sample++)
{
logitsPostProcessorFn(
reqIdBatch[sample], logitsBatch[sample], tokensBatch[sample], streamPtr, clientIdBatch[sample]);
}
};
if (batched)
{
auto batchedExecutorConfig = ExecutorConfig(beamWidth);
if (deviceIds.has_value())
{
auto parallelConfig = batchedExecutorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
batchedExecutorConfig.setParallelConfig(parallelConfig);
}
LogitsPostProcessorConfig logitsProcConfig{std::nullopt, logitsPostProcessorBatchedFn, replicated};
batchedExecutorConfig.setLogitsPostProcessorConfig(logitsProcConfig);
auto batchedExecutor = Executor(modelPath, ModelType::kDECODER_ONLY, batchedExecutorConfig);
if (worldRank == 0)
{
enqueueRequests(batchedExecutor, Request::kBatchedPostProcessorName);
collectResponses(batchedExecutor);
checkOutput();
}
}
if (!batched)
{
logitsProcessorMixedReqsTest(modelDir, worldRank, mMaxWaitMs, replicated, std::move(deviceIds));
}
}
// Test for mixing requests with and without logits processor.
void logitsProcessorMixedReqsTest(std::string const& modelDir, SizeType32 worldRank, SizeType32 maxWaitMs,
bool replicated, std::optional<std::vector<SizeType32>> deviceIds)
{
std::string const logitsProcessorName = "dummy";
auto logitsPostProcessorFn = [&](IdType reqId, Tensor& logits, BeamTokens const& tokens, StreamPtr const& streamPtr,
std::optional<IdType> clientId)
{
// Dummy callback that does not modify logits
assert(!clientId.has_value());
};
LogitsPostProcessorConfig logitsProcConfig{
std::unordered_map<std::string, tensorrt_llm::executor::LogitsPostProcessor>{
{logitsProcessorName, logitsPostProcessorFn}},
std::nullopt, replicated};
// Create executor
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
executorConfig.setLogitsPostProcessorConfig(logitsProcConfig);
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
std::filesystem::path modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / modelDir;
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
if (worldRank == 0)
{
SizeType32 numRequests = 2;
SizeType32 promptLen = 5;
// First request with no LP and many output tokens
auto request1 = Request(VecTokens(promptLen, 1), 25);
// Second request with LP and few output tokens
auto request2 = Request(VecTokens(promptLen, 1), 5);
request2.setLogitsPostProcessorName(logitsProcessorName);
// Enqueue requests
auto reqId1 = executor.enqueueRequest(request1);
auto reqId2 = executor.enqueueRequest(request2);
// Wait for responses
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < numRequests && iter < maxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
}
else
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err = "ReqId " + std::to_string(response.getRequestId())
+ " has already been processed and was terminated.";
EXPECT_EQ(response.getErrorMsg(), err);
}
}
++iter;
}
EXPECT_LT(iter, maxWaitMs);
}
}
TEST_F(GptExecutorTest, LogitsPostProcessorThrow)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
std::string const logitsProcessorName = "UnExistProcessor";
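    // Referencing a post-processor name that was never registered should make enqueueRequest throw.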
auto request
= Request(VecTokens(10, 1), 10, false, tensorrt_llm::executor::SamplingConfig(beamWidth), OutputConfig());
request.setLogitsPostProcessorName(logitsProcessorName);
EXPECT_THROW({ auto reqId = executor.enqueueRequest(std::move(request)); }, tensorrt_llm::common::TllmException);
}
static Response executeDraftRequest(Executor& executor)
{
OutputConfig outputConfig;
outputConfig.returnGenerationLogits = true;
// Create the request
SizeType32 maxNewTokens = 4;
VecTokens inputTokens{1, 2, 3, 4};
Request request{std::move(inputTokens), maxNewTokens};
request.setOutputConfig(outputConfig);
// Enqueue the request
auto requestId = executor.enqueueRequest(std::move(request));
// Wait for the response
auto responses = executor.awaitResponses(requestId);
return responses.at(0);
}
static Response executeTargetRequest(Executor& executor, Result const& draftResult)
{
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
Request request{std::move(inputTokens), maxNewTokens};
VecTokens const& outputTokenIds = draftResult.outputTokenIds.at(0);
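    // The draft request generated 4 new tokens; use them as the draft tokens for the target request.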
VecTokens draftTokens(outputTokenIds.end() - 4, outputTokenIds.end());
auto const& logitsInfo = draftResult.specDecFastLogitsInfo.value();
auto logitsTensor = logitsInfo.toTensor();
ExternalDraftTokensConfig draftTokensConfig(
std::move(draftTokens), logitsTensor, std::nullopt /* acceptance threshold */, true /* fastLogits */);
request.setExternalDraftTokensConfig(draftTokensConfig);
// Enqueue the request
auto requestId = executor.enqueueRequest(std::move(request));
// Wait for the response
auto responses = executor.awaitResponses(requestId);
return responses.at(0);
}
class SpeculativeDecodingTest : public GptExecutorTest
{
};
TEST_F(SpeculativeDecodingTest, SpecDecFastLogits)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtDraftEnginePath
= GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_GATHER_DIR() / "tp1-pp1-cp1-gpu";
auto trtEnginePath
= GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DRAFT_TOKENS_DIR() / "tp1-pp1-cp1-gpu";
FloatType freeGpuMemoryFraction = 0.3;
auto kvCacheConfig
= KvCacheConfig(true /* enableBlockReuse */, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
executorConfig.setKvCacheConfig(kvCacheConfig);
tensorrt_llm::mpi::initialize(tensorrt_llm::mpi::MpiThreadSupport::THREAD_MULTIPLE);
int const worldSize = tensorrt_llm::mpi::MpiComm::world().getSize();
ASSERT_EQ(worldSize, 3);
int const myRank = tensorrt_llm::mpi::MpiComm::world().getRank();
bool const isOrchestrator = (myRank == 0);
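    // Rank 0 acts as the orchestrator and drives both executors; rank 1 hosts the draft engine, rank 2 the target.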
auto orchestratorConfig
        = OrchestratorConfig(isOrchestrator, "" /* workerExecutablePath */, nullptr, false /* spawnProcesses */);
auto parallelConfig = ParallelConfig(
CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto specDecConfig = SpeculativeDecodingConfig(true /* fastLogits */);
executorConfig.setSpecDecConfig(specDecConfig);
std::unique_ptr<Executor> draftExecutor;
std::unique_ptr<Executor> targetExecutor;
if (isOrchestrator)
{
auto executorConfigDraft = executorConfig;
parallelConfig.setParticipantIds({1});
executorConfigDraft.setParallelConfig(parallelConfig);
draftExecutor = std::make_unique<Executor>(trtDraftEnginePath, ModelType::kDECODER_ONLY, executorConfigDraft);
parallelConfig.setParticipantIds({2});
executorConfig.setParallelConfig(parallelConfig);
targetExecutor = std::make_unique<Executor>(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
else if (myRank == 1) // draft model process
{
parallelConfig.setParticipantIds({1});
parallelConfig.setDeviceIds({0});
executorConfig.setParallelConfig(parallelConfig);
executorConfig.setGatherGenerationLogits(true);
draftExecutor = std::make_unique<Executor>(trtDraftEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
else if (myRank == 2) // target model process
{
parallelConfig.setParticipantIds({2});
parallelConfig.setDeviceIds({0});
executorConfig.setParallelConfig(parallelConfig);
draftExecutor = std::make_unique<Executor>(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
if (isOrchestrator)
{
auto response = executeDraftRequest(*draftExecutor);
ASSERT_FALSE(response.hasError());
response = executeTargetRequest(*targetExecutor, response.getResult());
ASSERT_FALSE(response.hasError());
}
}
TEST_F(GptExecutorTest, OrchestratorMaxQueueSize)
{
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
SizeType32 maxQueueSize = 6;
ExecutorConfig executorConfig;
executorConfig.setMaxQueueSize(maxQueueSize);
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(
CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 100;
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens);
std::vector<IdType> requestIds;
auto numberOfRequests = maxQueueSize * 5;
requestIds.reserve(numberOfRequests);
// Enqueue more requests than the queue can manage
for (int i = 0; i < numberOfRequests; i++)
{
auto requestId = executor.enqueueRequest(request);
requestIds.emplace_back(requestId);
}
auto responseVectors = executor.awaitResponses(std::move(requestIds));
bool failedWithFullQueue = false;
for (auto& responseVector : responseVectors)
{
for (auto& response : responseVector)
{
if (response.hasError())
{
EXPECT_THAT(response.getErrorMsg(),
testing::HasSubstr("Maximum queue size of 6 has been reached, please try again later"));
failedWithFullQueue = true;
}
}
}
EXPECT_TRUE(failedWithFullQueue) << "Expected requests to fail due to maximum queue size reached";
// Wait for requests to get scheduled to free up space in queue
std::this_thread::sleep_for(std::chrono::milliseconds(maxQueueSize * 200));
auto requestId = executor.enqueueRequest(std::move(request));
auto responses = executor.awaitResponses(requestId);
for (auto& response : responses)
{
EXPECT_FALSE(response.hasError());
}
}
TEST_F(GptExecutorTest, SingleRequestInvalidInputs)
{
bool streaming = true;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
std::vector<std::string> expectedErrMsgs;
std::vector<Request> requests;
// Invalid embedding bias shape
{
requests.emplace_back(inputTokens, maxNewTokens, streaming);
auto embeddingBias = Tensor::cpu(DataType::kFP32, {1});
requests.back().setEmbeddingBias(embeddingBias);
expectedErrMsgs.emplace_back("embedding bias shape is not as expected");
}
for (auto req = 0; req < requests.size(); ++req)
{
auto& request = requests.at(req);
auto const& expectedErrMsg = expectedErrMsgs.at(req);
auto requestId = executor.enqueueRequest(std::move(request));
// Try to get the new tokens
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
EXPECT_THAT(err, testing::HasSubstr(expectedErrMsg));
done = true;
}
else
{
FAIL() << "Expected an err: " << expectedErrMsg;
}
}
++iter;
}
EXPECT_EQ(done, true);
}
}
TEST_F(GptExecutorTest, ExecutorKVCacheManager)
{
bool streaming = true;
int numRequests = 3;
SizeType32 beamWidth = 1;
SizeType32 maxNewTokens = 5;
auto executorConfig = ExecutorConfig(beamWidth);
auto kvCacheConfig = KvCacheConfig(true, 128);
kvCacheConfig.setEventBufferMaxSize(1024);
executorConfig.setKvCacheConfig(kvCacheConfig);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
auto kvCacheManager = *executor.getKVCacheEventManager();
// Created event should be available before any requests.
auto events = kvCacheManager->getLatestEvents(std::chrono::seconds(1));
EXPECT_EQ(events.size(), 1);
EXPECT_TRUE(std::holds_alternative<KVCacheCreatedData>(events.front().data));
// Create requests
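    // Each prompt has 63 tokens so that (assuming 32 tokens per KV cache block) it fills one block and most of a
    // second; the generated tokens then complete that block and start a partial decode block, which drives the
    // KV cache events checked below.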
std::vector<Request> requests;
    for (int request = 0; request < numRequests; request++)
{
VecTokens inputTokens;
for (int i = 0; i < 63; i++)
{
inputTokens.emplace_back(i + request);
}
requests.emplace_back(inputTokens, maxNewTokens, streaming);
}
for (auto req = 0; req < requests.size(); ++req)
{
auto& request = requests.at(req);
auto requestId = executor.enqueueRequest(std::move(request));
// Get the new tokens
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
// This request failed for some reason, get error msg
std::string errStr
= "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg();
                    FAIL() << errStr;
}
else
{
auto result = response.getResult();
done = result.isFinal;
if (done)
{
std::this_thread::sleep_for(std::chrono::milliseconds(100));
auto events = kvCacheManager->getLatestEvents(std::chrono::milliseconds(100));
if (req == 0)
{
EXPECT_EQ(events.size(), 2);
// Store the first context block
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).parentHash, std::nullopt);
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).blocks.size(), 1);
// Store the second (now completed) context block and the partial decode block.
EXPECT_EQ(std::get<KVCacheStoredData>(events.back().data).blocks.size(), 2);
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).blocks[0].blockHash,
std::get<KVCacheStoredData>(events.back().data).parentHash);
}
else
{
EXPECT_EQ(events.size(), 4);
                            // Remove block(s) to make room for the second context block: one block is removed
                            // for the second request and two for the third.
EXPECT_EQ(std::get<KVCacheRemovedData>(events.front().data).blockHashes.size(), req);
events.pop_front();
// Store the first filled context block
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).blocks.size(), 1);
events.pop_front();
// Remove a block for the decode phase
EXPECT_EQ(std::get<KVCacheRemovedData>(events.front().data).blockHashes.size(), 1);
events.pop_front();
// Store the final context block and the decode block
EXPECT_EQ(std::get<KVCacheStoredData>(events.front().data).blocks.size(), 2);
}
}
}
}
iter++;
}
EXPECT_EQ(done, true);
}
}
TEST_F(GptExecutorTest, SingleRequestLora)
{
bool streaming = true;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
    // Load LoRA weights and config
auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
auto loraWeightsTensor
= std::shared_ptr(tr::utils::loadNpy(manager, LORA_WEIGHTS_FILE.string(), tr::MemoryType::kCPU));
auto loraConfigTensor
= std::shared_ptr(tr::utils::loadNpy(manager, LORA_CONFIG_FILE.string(), tr::MemoryType::kCPU));
// Create the request
SizeType32 maxNewTokens = 5;
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig());
auto loraConfig = LoraConfig(0, detail::ofITensor(loraWeightsTensor), detail::ofITensor(loraConfigTensor));
request.setLoraConfig(loraConfig);
// Enqueue the request
auto requestId = executor.enqueueRequest(std::move(request));
// Get the new tokens
VecTokens tokens;
bool done = false;
int iter = 0;
std::chrono::milliseconds waitTime(1);
while (!done && iter < mMaxWaitMs)
{
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
// This request failed for some reason, get error msg
std::string errStr
= "Request id " + std::to_string(requestId) + " failed with err " + response.getErrorMsg();
                FAIL() << errStr;
}
else
{
auto result = response.getResult();
done = result.isFinal;
// Append tokens
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
tokens.insert(
tokens.end(), std::make_move_iterator(newTokens.begin()), std::make_move_iterator(newTokens.end()));
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_EQ(tokens.size(), maxNewTokens);
}
TEST_P(GuidedDecodingParamsTest, All)
{
auto const modelName = std::get<0>(GetParam());
std::filesystem::path enginePath;
std::filesystem::path tokenizerInfoPath;
int tp_size = 1, pp_size = 1, cp_size = 1;
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
if (modelName == "gpt")
{
enginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
tokenizerInfoPath = GPT_XGRAMMAR_TOKENIZER_INFO_PATH;
}
else if (modelName == "llama_tp1_pp1_cp1")
{
enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH;
}
else if (modelName == "llama_tp4_pp1_cp1")
{
enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH;
tp_size = 4;
}
else if (modelName == "llama_tp1_pp4_cp1")
{
enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH;
pp_size = 4;
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
enginePath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
tokenizerInfoPath = LLAMA_XGRAMMAR_TOKENIZER_INFO_PATH;
tp_size = 2;
pp_size = 2;
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
else
{
TLLM_THROW("Unrecognized modelName");
}
auto& comm = tensorrt_llm::mpi::MpiComm::world();
auto const worldRank = comm.getRank();
auto const worldSize = comm.getSize();
if (tp_size * pp_size * cp_size > 1)
{
// Run multi GPU test only when env variable is set
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
        if (val == nullptr)
{
GTEST_SKIP() << "Skipping multi-gpu guided decoding test";
}
else
{
if (worldSize != 4)
{
FAIL() << "Leader mode and world size is not equal to 4";
}
}
}
bool streaming = false;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto const tokenizerInfo = nlohmann::json::parse(std::ifstream{tokenizerInfoPath});
auto const encodedVocab = tokenizerInfo["encoded_vocab"].template get<std::vector<std::string>>();
auto const tokenizerStr = tokenizerInfo["tokenizer_str"].template get<std::string>();
auto const stopTokenIds = tokenizerInfo["stop_token_ids"].template get<std::vector<TokenIdType>>();
GuidedDecodingConfig guidedDecodingConfig(
GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR, encodedVocab, tokenizerStr, stopTokenIds);
executorConfig.setGuidedDecodingConfig(guidedDecodingConfig);
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
auto executor = Executor(enginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the requests
VecTokens inputTokens;
if (modelName == "gpt")
{
inputTokens = {2061, 318, 352, 10, 16, 30, 23998, 39559, 287, 257, 8633, 287, 33918, 5794, 25, 220};
}
else // llama
{
inputTokens = {
128000, 62, 3923, 7037, 62, 16, 10, 16, 30, 62, 16533, 87710, 1265, 4404, 5356, 1265, 9643, 9132, 25, 62};
}
SizeType32 maxNewTokens = 10;
SamplingConfig samplingConfig{};
OutputConfig outputConfig{false, false, false, true};
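    // Enqueue five requests: an unguided baseline, then one each guided by JSON, a JSON schema, a regex and an EBNF grammar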
std::vector<Request> requests;
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON));
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
std::string jsonSchema{
R"({"properties": {"answer": {"title": "Answer", "type": "integer"}}, "required": ["answer"], "title": "Answer", "type": "object"})"};
requests.back().setGuidedDecodingParams(
GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON_SCHEMA, jsonSchema));
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
std::string regex{R"(\d+)"};
requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kREGEX, regex));
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
std::string ebnfGrammar{R"(root ::= [0-9]+)"};
requests.back().setGuidedDecodingParams(
GuidedDecodingParams(GuidedDecodingParams::GuideType::kEBNF_GRAMMAR, ebnfGrammar));
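    // Expected generations, in the same order as the requests above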
std::vector<VecTokens> expectedOutputTokens;
if (modelName == "gpt")
{
expectedOutputTokens.push_back({1849, 7, 16, 10, 16, 8, 198, 16, 10, 16});
expectedOutputTokens.push_back({90, 366, 3672, 1298, 366, 7554, 31780, 1600, 366, 12888});
expectedOutputTokens.push_back({90, 366, 64, 77, 2032, 68, 81, 1, 1058, 352});
expectedOutputTokens.push_back({25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645});
expectedOutputTokens.push_back({25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645, 25645});
}
else // llama
{
expectedOutputTokens.push_back({16, 10, 16, 28, 17, 198, 62, 3923, 7037, 62});
expectedOutputTokens.push_back({5018, 16, 794, 330, 16, 498, 330, 17, 794, 330});
expectedOutputTokens.push_back({5018, 9399, 794, 16, 92});
expectedOutputTokens.push_back({16});
expectedOutputTokens.push_back({16});
}
if (executor.canEnqueueRequests())
{
// Enqueue the requests
auto reqIds = executor.enqueueRequests(std::move(requests));
// Get the responses
int numFinished = 0;
int iter = 0;
while (numFinished < 5 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
auto reqId = response.getRequestId();
if (response.hasError())
{
// This request failed for some reason, get error msg
                    std::string errStr
                        = "Request id " + std::to_string(reqId) + " failed with err " + response.getErrorMsg();
                    FAIL() << errStr;
}
else
{
auto result = response.getResult();
auto& newTokens = result.outputTokenIds.at(0);
int reqIdx = std::find(reqIds.begin(), reqIds.end(), reqId) - reqIds.begin();
EXPECT_THAT(newTokens, ::testing::ElementsAreArray(expectedOutputTokens[reqIdx]));
}
                numFinished++;
            }
            ++iter;
        }
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_EQ(numFinished, 5);
}
}
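// A request that asks for guided decoding while the executor was built without a GuidedDecodingConfig
// should come back with an error response.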
TEST_F(GptExecutorTest, GuidedDecodingFailure)
{
bool streaming = false;
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
std::vector<int> stopTokenIds{50256};
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the requests
SizeType32 maxNewTokens = 10;
SamplingConfig samplingConfig{};
OutputConfig outputConfig{false, false, false, true};
VecTokens inputTokens{2061, 318, 352, 10, 16, 30, 23998, 39559, 287, 257, 8633, 287, 33918, 5794, 25, 220};
std::vector<Request> requests;
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
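    // The second request asks for JSON-guided decoding, but the executor was built without a
    // GuidedDecodingConfig, so it is expected to come back with an error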
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outputConfig, stopTokenIds[0]);
requests.back().setGuidedDecodingParams(GuidedDecodingParams(GuidedDecodingParams::GuideType::kJSON));
// Enqueue the requests
auto reqIds = executor.enqueueRequests(std::move(requests));
// Get the responses
int numFinished = 0;
int iter = 0;
while (numFinished < 2 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
auto reqId = response.getRequestId();
int reqIdx = std::find(reqIds.begin(), reqIds.end(), reqId) - reqIds.begin();
if (reqIdx == 0)
{
EXPECT_FALSE(response.hasError());
}
else
{
EXPECT_TRUE(response.hasError());
}
            numFinished++;
        }
        ++iter;
    }
EXPECT_LT(iter, mMaxWaitMs);
EXPECT_EQ(numFinished, 2);
}
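// Enqueue a single long request, cancel it shortly after, and check that generation stops early
// with FinishReason::kCANCELLED and fewer tokens than requested.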
TEST_P(ParamTest, SingleRequestCancelRequest)
{
bool const streaming = std::get<0>(GetParam());
bool const excludeInputFromOutput = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
OutputConfig outConfig;
outConfig.excludeInputFromOutput = excludeInputFromOutput;
auto executorConfig = ExecutorConfig(beamWidth);
auto trtEnginePath = GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 300;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, streaming, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestId = executor.enqueueRequest(std::move(request));
std::this_thread::sleep_for(std::chrono::milliseconds(100));
executor.cancelRequest(requestId);
// Try to get the new tokens
bool done = false;
int iter = 0;
VecTokens tokens;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(requestId, waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
FAIL() << "Did not expect errors";
}
else
{
auto result = response.getResult();
done = result.isFinal;
// Append tokens
auto& newTokens = result.outputTokenIds.at(beamWidth - 1);
if (done)
{
for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
{
EXPECT_EQ(result.finishReasons[beamIdx], FinishReason::kCANCELLED);
}
}
if (streaming && beamWidth > 1)
{
tokens = newTokens;
}
else
{
tokens.insert(tokens.end(), newTokens.begin(), newTokens.end());
}
}
}
++iter;
}
EXPECT_EQ(done, true);
EXPECT_LT(iter, mMaxWaitMs);
auto expectedNumTokens
= streaming ? maxNewTokens : (excludeInputFromOutput ? 0 : inputTokens.size()) + maxNewTokens;
TLLM_LOG_INFO("num tokens: %d, expected %d", tokens.size(), expectedNumTokens);
EXPECT_LT(tokens.size(), expectedNumTokens);
}
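// Orchestrator mode: a prompt that exceeds the maximum input length should surface as an error response
// reported while fetching the new request.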
TEST_F(GptExecutorTest, orchModeFetchNewReqErr)
{
SizeType32 beamWidth = 1;
auto executorConfig = ExecutorConfig(beamWidth);
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(
CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Create a req with invalid parameters
SizeType32 maxNewTokens = 5;
    // Create a very long prompt, which should result in an error during request validation
VecTokens inputTokens(10000000);
auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
auto requestId = executor.enqueueRequest(request);
auto requestId2 = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
EXPECT_THAT(err, testing::HasSubstr("exceeds maximum input length"));
EXPECT_THAT(err, testing::HasSubstr("Encountered an error when fetching new request:"));
done = true;
}
else
{
FAIL() << "Should get a response with error";
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
}
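// Orchestrator mode: a request whose beam width exceeds the configured max beam width should surface
// as an error response.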
TEST_F(GptExecutorTest, orchModeForwardError)
{
SizeType32 constexpr maxBeamWidth{1};
auto executorConfig = ExecutorConfig(maxBeamWidth);
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(
CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR, std::nullopt, std::nullopt, orchestratorConfig);
executorConfig.setParallelConfig(parallelConfig);
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
// Setting request beam width to 2 which should cause failure
SizeType32 constexpr beamWidth{2};
SizeType32 constexpr maxNewTokens{5};
VecTokens inputTokens{1, 2, 3, 4};
auto request = Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
auto requestId = executor.enqueueRequest(request);
auto requestId2 = executor.enqueueRequest(request);
bool done = false;
int iter = 0;
while (!done && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
if (response.hasError())
{
auto err = response.getErrorMsg();
std::cout << "err:" << err << std::endl;
EXPECT_THAT(
err, testing::HasSubstr("Requested beam width 2 is larger than configured max beam width 1"));
done = true;
}
else
{
FAIL() << "Should get a response with error";
}
}
++iter;
}
EXPECT_LT(iter, mMaxWaitMs);
}
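// Multi-GPU cancellation test: enqueue a mix of streaming and non-streaming requests (some with
// numReturnSequences > 1), cancel a subset of them, and verify that cancelled requests return fewer
// tokens than expected while the others complete normally.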
TEST_P(ParamCancelReqTest, MultipleRequestsMultiGpuCancelRequest)
{
auto const useOrchestratorMode = std::get<0>(GetParam());
auto const beamWidth = std::get<1>(GetParam());
auto const modelName = std::get<2>(GetParam());
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
OutputConfig outConfig;
auto executorConfig = ExecutorConfig(beamWidth);
std::filesystem::path modelPath;
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
{
if (modelName == "llama_tp4_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
}
// For llama model, only run for multiple GPUs
// This is detected by setting an env variable when running the test
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == NULL)
{
GTEST_SKIP() << "Skipping Llama test";
}
else
{
// Check that it was launched with right number of MPI ranks
if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
else if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
{
            // Orchestrator mode, the test binary itself must run with a single rank
            FAIL() << "Orchestrator mode and world size is not equal to 1";
}
}
if (useOrchestratorMode)
{
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
std::nullopt, orchestratorConfig);
if (deviceIds.has_value())
{
parallelConfig.setDeviceIds(deviceIds.value());
}
executorConfig.setParallelConfig(parallelConfig);
}
else
{
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
// Create the request
SizeType32 maxNewTokens = 50;
VecTokens inputTokens{1, 2, 3, 4};
std::vector<Request> requests;
for (auto streaming : {false, true})
{
// Add two requests with numReturnSequences = 1
auto samplingConfig = tensorrt_llm::executor::SamplingConfig(beamWidth);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig);
// Add a request with numReturnSequences > 1
auto samplingConfig2 = tensorrt_llm::executor::SamplingConfig(beamWidth);
auto constexpr numReturnSequences = 2;
samplingConfig2.setNumReturnSequences(numReturnSequences);
requests.emplace_back(inputTokens, maxNewTokens, streaming, samplingConfig2, outConfig);
}
std::vector<bool> cancelRequests{true, false, true, true, false, true};
if (executor.canEnqueueRequests())
{
auto const requestIds = executor.enqueueRequests(requests);
        // Cancel the requests flagged in cancelRequests (the 1st, 3rd, 4th and 6th)
std::this_thread::sleep_for(std::chrono::milliseconds(50));
for (SizeType32 i = 0; i < requests.size(); i++)
{
if (cancelRequests.at(i))
{
executor.cancelRequest(requestIds.at(i));
}
}
std::unordered_map<IdType, bool> isStreaming;
std::unordered_map<IdType, SizeType32> expectedNumTokens;
SizeType32 expectedNumResponses = 0;
for (SizeType32 i = 0; i < requests.size(); i++)
{
auto const& request = requests.at(i);
auto requestId = requestIds.at(i);
isStreaming[requestId] = request.getStreaming();
expectedNumTokens[requestId] = (request.getStreaming() ? 0 : inputTokens.size()) + maxNewTokens;
auto const numResponses = request.getStreaming() ? expectedNumTokens[requestId] : 1;
auto const numReturnSequences = request.getSamplingConfig().getBeamWidth() > 1
? 1
: request.getSamplingConfig().getNumReturnSequences().value_or(1);
expectedNumResponses += numResponses * numReturnSequences;
}
std::unordered_map<IdType, std::unordered_map<SizeType32, VecTokens>> tokens;
        // Get the new tokens for each request
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < requests.size() && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto requestId = response.getRequestId();
auto result = response.getResult();
numFinished += result.isFinal;
auto seqIdx = result.sequenceIndex;
auto numSequences = result.outputTokenIds.size();
auto& newTokens = result.outputTokenIds.at(numSequences - 1);
auto& reqResults = tokens[response.getRequestId()];
auto& reqTokens = reqResults[seqIdx];
if (isStreaming.at(requestId) && beamWidth > 1)
{
reqTokens = newTokens;
}
else
{
reqTokens.insert(reqTokens.end(), newTokens.begin(), newTokens.end());
}
}
else
{
FAIL() << "Did not expect errors";
}
}
++iter;
}
EXPECT_LE(numResponses, expectedNumResponses);
EXPECT_EQ(numFinished, requests.size());
EXPECT_LT(iter, mMaxWaitMs);
for (auto requestIdx = 0; requestIdx < requests.size(); requestIdx++)
{
auto const requestId = requestIds.at(requestIdx);
for (auto seqIdx = 0; seqIdx < tokens.at(requestId).size(); seqIdx++)
{
auto const& seqTokens = tokens.at(requestId).at(seqIdx);
if (cancelRequests.at(requestIdx))
{
EXPECT_LT(seqTokens.size(), expectedNumTokens.at(requestId));
}
else
{
EXPECT_EQ(seqTokens.size(), expectedNumTokens.at(requestId));
}
}
}
}
}
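// Leader mode on multi-GPU engines: the leader rank enqueues requests and awaits responses, while
// non-leader ranks must throw when calling the request/response APIs.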
TEST_P(LeaderApiUsageTest, LeaderModeTest)
{
auto const modelName = std::get<0>(GetParam());
SizeType32 beamWidth = 2;
OutputConfig outConfig;
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
auto executorConfig = ExecutorConfig(beamWidth);
std::filesystem::path modelPath;
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
{
if (modelName == "llama_tp4_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
}
// For llama model, only run for multiple GPUs
// This is detected by setting an env variable when running the test
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == NULL)
{
GTEST_SKIP() << "Skipping Llama test";
}
else
{
// Check that it was launched with right number of MPI ranks
if (COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
}
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
// Since this is leader mode, all ranks should participate
EXPECT_TRUE(executor.isParticipant());
// Create the request
SizeType32 maxNewTokens = 50;
VecTokens inputTokens{1, 2, 3, 4};
auto request
= Request(inputTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
auto requestStreaming
= Request(inputTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth), outConfig);
    // Leader enqueues requests and waits for responses
if (executor.canEnqueueRequests())
{
auto requestId = executor.enqueueRequest(request);
auto requestId2 = executor.enqueueRequest(request);
auto requestId3 = executor.enqueueRequest(requestStreaming);
auto requestId4 = executor.enqueueRequest(requestStreaming);
int32_t numFinished = 0;
int iter = 0;
SizeType32 numResponses = 0;
while (numFinished < 4 && iter < mMaxWaitMs)
{
std::chrono::milliseconds waitTime(1);
auto responses = executor.awaitResponses(waitTime);
for (auto& response : responses)
{
numResponses++;
if (!response.hasError())
{
auto result = response.getResult();
numFinished += result.isFinal;
}
else
{
FAIL() << "Did not expect errors";
}
}
++iter;
}
EXPECT_EQ(numFinished, 4);
EXPECT_LT(iter, mMaxWaitMs);
}
else
{
        // Check that non-leader ranks cannot enqueue requests or use the other request/response APIs
EXPECT_THROW({ auto reqId = executor.enqueueRequest(request); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto responses = executor.awaitResponses(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto numResp = executor.getNumResponsesReady(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ executor.cancelRequest(1); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto stats = executor.getLatestIterationStats(); }, tensorrt_llm::common::TllmException);
EXPECT_THROW({ auto stats = executor.getLatestRequestStats(); }, tensorrt_llm::common::TllmException);
}
}
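// ParallelConfig validation: orchestrator mode without an OrchestratorConfig should be rejected
// at executor construction.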
TEST_F(GptExecutorTest, validateParallelConfig)
{
auto trtEnginePath = (GPT_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu");
{
auto executorConfig = ExecutorConfig();
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
}
{
std::string expectedErrMsg = "OrchestratorConfig must be set";
try
{
auto executorConfig = ExecutorConfig();
auto parallelConfig = ParallelConfig(CommunicationType::kMPI, CommunicationMode::kORCHESTRATOR);
executorConfig.setParallelConfig(parallelConfig);
auto executor = Executor(trtEnginePath, ModelType::kDECODER_ONLY, executorConfig);
FAIL() << "Expected TllmException";
}
catch (tc::TllmException& e)
{
EXPECT_THAT(e.what(), testing::HasSubstr(expectedErrMsg));
}
catch (std::exception const& e)
{
FAIL() << "Expected TllmException";
}
}
}
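// Streaming timeout test: requests with allotted times of 0 ms, 1 ms and 5 s are expected to finish
// with kTIMED_OUT, kTIMED_OUT and kLENGTH respectively, with generated lengths within the expected bounds.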
TEST_P(TimeoutTest, TimeoutStreamingTest)
{
auto const modelName = std::get<0>(GetParam());
auto const useOrchestratorMode = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
auto executorConfig = ExecutorConfig(beamWidth);
std::filesystem::path modelPath;
bool isMultiGpu{false};
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
{
isMultiGpu = true;
if (modelName == "llama_tp4_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
}
if (modelName == "llama_tp1_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
}
    // Multi-GPU llama engines run only when the RUN_LLAMA_MULTI_GPU env variable is set;
    // single-GPU engines run only when it is not
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == NULL && isMultiGpu)
{
GTEST_SKIP() << "Skipping MultiGpu tests";
}
if (val != NULL && !isMultiGpu)
{
GTEST_SKIP() << "Skipping SingleGpu tests";
}
if (val != NULL && isMultiGpu)
{
// Check that it was launched with right number of MPI ranks
if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
{
            // Orchestrator mode, the test binary itself must run with a single rank
            FAIL() << "Orchestrator mode and world size is not equal to 1";
}
}
if (useOrchestratorMode)
{
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
std::nullopt, orchestratorConfig);
if (deviceIds.has_value())
{
parallelConfig.setDeviceIds(deviceIds.value());
}
executorConfig.setParallelConfig(parallelConfig);
}
else
{
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr maxNewTokens = 10;
    // Create one request that times out immediately.
    // Requests are currently not cancelled before forwardAsync, so it will still be scheduled for at least one forward.
VecTokens immediateCancelTokens{1, 2, 3, 4};
auto immediateCancelRequest
= Request(immediateCancelTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
immediateCancelRequest.setReturnAllGeneratedTokens(true);
immediateCancelRequest.setAllottedTimeMs(std::chrono::milliseconds(0));
SizeType32 constexpr immediateCancelMinLength = 0;
SizeType32 constexpr immediateCancelMaxLength = 1;
// create 1 request that times out during the first forward
VecTokens oneForwardTokens{11, 12, 13, 14};
auto oneForwardRequest
= Request(oneForwardTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
oneForwardRequest.setReturnAllGeneratedTokens(true);
oneForwardRequest.setAllottedTimeMs(std::chrono::milliseconds(1));
    SizeType32 constexpr oneForwardMinLength = 0;
    SizeType32 constexpr oneForwardMaxLength = 1;
// Create the request that finishes by the number of tokens
VecTokens finishedTokens{101, 102, 103, 104};
auto finishedRequest
= Request(finishedTokens, maxNewTokens, true, tensorrt_llm::executor::SamplingConfig(beamWidth));
finishedRequest.setReturnAllGeneratedTokens(true);
finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(5000));
SizeType32 constexpr finishedMinLength = 5;
SizeType32 constexpr finishedMaxLength = maxNewTokens;
std::vector<FinishReason> referenceFinishReasons
= {FinishReason::kTIMED_OUT, FinishReason::kTIMED_OUT, FinishReason::kLENGTH};
    std::vector<SizeType32> minLengths = {immediateCancelMinLength, oneForwardMinLength, finishedMinLength};
    std::vector<SizeType32> maxLengths = {immediateCancelMaxLength, oneForwardMaxLength, finishedMaxLength};
    // The final streaming response can be empty, so track the longest response seen per request and
    // check it against minLength at the end
std::vector<SizeType32> achievedLength = {0, 0, 0};
SizeType32 itNr{0};
if (executor.canEnqueueRequests())
{
std::vector<Request> requests = {immediateCancelRequest, oneForwardRequest, finishedRequest};
auto requestIds = executor.enqueueRequests(requests);
auto numFinished = 0;
while (numFinished < static_cast<SizeType32>(requests.size()))
{
itNr++;
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(requestIds, waitTime);
for (auto const& response : responses)
{
for (auto const& responseIt : response)
{
auto const reqId = responseIt.getRequestId();
if (responseIt.hasError())
{
// Allow response with error only if awaitResponse processed a terminated request id
std::string err
= "ReqId " + std::to_string(reqId) + " has already been processed and was terminated.";
if (responseIt.getErrorMsg() != err)
{
TLLM_THROW("Request id %lu encountered error: %s", reqId, responseIt.getErrorMsg().c_str());
}
continue;
}
auto const& result = responseIt.getResult();
if (result.isFinal)
{
requestIds.erase(std::remove(requestIds.begin(), requestIds.end(), reqId), requestIds.end());
numFinished++;
}
auto const finishReason = result.finishReasons;
auto const actualResponse = result.outputTokenIds;
TLLM_LOG_DEBUG("reqId %d finished %d", reqId, result.isFinal);
TLLM_LOG_DEBUG("actual response:");
for (auto const& beam : actualResponse)
{
std::string tokenStr;
for (auto tok : beam)
{
tokenStr += std::to_string(tok) + " ";
}
TLLM_LOG_DEBUG("%s", tokenStr.c_str());
}
TLLM_LOG_DEBUG(
"beams' length must be in range [%d, %d]", minLengths[reqId - 1], maxLengths[reqId - 1]);
if (result.isFinal)
{
TLLM_LOG_DEBUG("finishReason");
std::string reasonStr;
for (auto const reason : finishReason)
{
// cast for easier visibility during debugging
EXPECT_EQ(static_cast<int>(reason), static_cast<int>(referenceFinishReasons[reqId - 1]));
reasonStr += std::to_string(static_cast<int>(reason)) + " ";
}
TLLM_LOG_DEBUG("%s", reasonStr.c_str());
}
EXPECT_EQ(beamWidth, actualResponse.size());
for (int beam = 0; beam < beamWidth; beam++)
{
EXPECT_LE(actualResponse.at(beam).size(), maxLengths[reqId - 1]) << "for request " << reqId;
achievedLength[reqId - 1] = std::max(
achievedLength[reqId - 1], static_cast<SizeType32>(actualResponse.at(beam).size()));
}
}
}
}
for (int reqIt = 0; reqIt < achievedLength.size(); ++reqIt)
{
EXPECT_GE(achievedLength[reqIt], minLengths[reqIt])
<< "request " << reqIt + 1 << " has not achieved min lengths";
}
}
}
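// Non-streaming timeout test: timed-out requests are expected to return only their input tokens,
// while the request with a generous allotted time completes and matches the reference output.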
TEST_P(TimeoutTest, TimeoutNonstreamingTest)
{
auto const modelName = std::get<0>(GetParam());
auto const useOrchestratorMode = std::get<1>(GetParam());
auto const beamWidth = std::get<2>(GetParam());
std::optional<std::vector<SizeType32>> deviceIds = std::nullopt;
auto executorConfig = ExecutorConfig(beamWidth);
std::filesystem::path modelPath;
bool isMultiGpu{false};
if (modelName == "llama_tp4_pp1_cp1" || modelName == "llama_tp1_pp4_cp1" || modelName == "llama_tp2_pp2_cp1")
{
isMultiGpu = true;
if (modelName == "llama_tp4_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp4-pp1-cp1-gpu";
}
else if (modelName == "llama_tp1_pp4_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp4-cp1-gpu";
deviceIds = std::vector<SizeType32>{3, 2, 1, 0};
}
else if (modelName == "llama_tp2_pp2_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp2-pp2-cp1-gpu";
deviceIds = std::vector<SizeType32>{2, 3, 0, 1};
}
}
if (modelName == "llama_tp1_pp1_cp1")
{
modelPath = LLAMA_MODEL_PATH / PathUtil::FP16_GPT_ATTENTION_PACKED_PAGED_DIR() / "tp1-pp1-cp1-gpu";
}
    // Multi-GPU llama engines run only when the RUN_LLAMA_MULTI_GPU env variable is set;
    // single-GPU engines run only when it is not
char const* val = getenv("RUN_LLAMA_MULTI_GPU");
if (val == NULL && isMultiGpu)
{
GTEST_SKIP() << "Skipping MultiGpu tests";
}
if (val != NULL && !isMultiGpu)
{
GTEST_SKIP() << "Skipping SingleGpu tests";
}
if (val != NULL && isMultiGpu)
{
// Check that it was launched with right number of MPI ranks
if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
{
// No orchestrator, need worldSize to match TP*PP
FAIL() << "Leader mode and world size is not equal to 4";
}
if (useOrchestratorMode && COMM_SESSION.getSize() != 1)
{
            // Orchestrator mode, the test binary itself must run with a single rank
            FAIL() << "Orchestrator mode and world size is not equal to 1";
}
}
if (useOrchestratorMode)
{
auto orchestratorConfig = OrchestratorConfig(true, PathUtil::EXECUTOR_WORKER_PATH());
auto parallelConfig = ParallelConfig(CommunicationType::kMPI,
useOrchestratorMode ? CommunicationMode::kORCHESTRATOR : CommunicationMode::kLEADER, std::nullopt,
std::nullopt, orchestratorConfig);
if (deviceIds.has_value())
{
parallelConfig.setDeviceIds(deviceIds.value());
}
executorConfig.setParallelConfig(parallelConfig);
}
else
{
if (deviceIds.has_value())
{
auto parallelConfig = executorConfig.getParallelConfig().value_or(ParallelConfig());
parallelConfig.setDeviceIds(deviceIds.value());
executorConfig.setParallelConfig(parallelConfig);
}
}
auto executor = Executor(modelPath, ModelType::kDECODER_ONLY, executorConfig);
SizeType32 constexpr maxNewTokens = 5;
    // Create one request that times out immediately.
    // Requests are currently not cancelled before forwardAsync, so it will still be scheduled for at least one forward.
VecTokens immediateCancelTokens{1, 2, 3, 4};
auto immediateCancelRequest
= Request(immediateCancelTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
immediateCancelRequest.setAllottedTimeMs(std::chrono::milliseconds(0));
std::vector<std::vector<int>> immediateCancelResponse = {immediateCancelTokens, immediateCancelTokens};
// create 1 request that times out during the first forward
VecTokens oneForwardTokens{11, 12, 13, 14};
auto oneForwardRequest
= Request(oneForwardTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
oneForwardRequest.setAllottedTimeMs(std::chrono::milliseconds(1));
std::vector<std::vector<int>> oneForwardResponse = {oneForwardTokens, oneForwardTokens};
// Create the request that finishes by the number of tokens
VecTokens finishedTokens{101, 102, 103, 104};
auto finishedRequest
= Request(finishedTokens, maxNewTokens, false, tensorrt_llm::executor::SamplingConfig(beamWidth));
finishedRequest.setAllottedTimeMs(std::chrono::milliseconds(6000));
    std::vector<std::vector<int>> finishedResponse
        = {{101, 102, 103, 104, 49849, 225, 49849, 232, 55742}, {101, 102, 103, 104, 49849, 225, 49849, 232, 29082}};
    // Reference responses indexed by request id (ids are assumed to be assigned sequentially starting at 1)
    std::vector<BeamTokens> refResponses = {immediateCancelResponse, oneForwardResponse, finishedResponse};
std::vector<FinishReason> referenceFinishReasons
= {FinishReason::kTIMED_OUT, FinishReason::kTIMED_OUT, FinishReason::kLENGTH};
if (executor.canEnqueueRequests())
{
std::vector<Request> requests = {immediateCancelRequest, oneForwardRequest, finishedRequest};
auto requestIds = executor.enqueueRequests(requests);
std::chrono::milliseconds waitTime(mMaxWaitMs);
auto responses = executor.awaitResponses(requestIds, waitTime);
for (auto const& response : responses)
{
for (auto const& responseIt : response)
{
auto const reqId = responseIt.getRequestId();
if (responseIt.hasError())
{
TLLM_THROW("Request id %lu encountered error: %s", reqId, responseIt.getErrorMsg().c_str());
}
auto const& result = responseIt.getResult();
auto const finishReason = result.finishReasons;
auto const actualResponse = result.outputTokenIds;
TLLM_LOG_DEBUG("reqId %d finished %d", reqId, result.isFinal);
TLLM_LOG_DEBUG("actual response:");
for (auto const& beam : actualResponse)
{
std::string tokenStr;
for (auto tok : beam)
{
tokenStr += std::to_string(tok) + " ";
}
TLLM_LOG_DEBUG("%s", tokenStr.c_str());
}
TLLM_LOG_DEBUG("reference:");
auto referenceResponse = refResponses[reqId - 1];
for (auto const& beam : referenceResponse)
{
std::string tokenStr;
for (auto tok : beam)
{
tokenStr += std::to_string(tok) + " ";
}
TLLM_LOG_DEBUG("%s", tokenStr.c_str());
}
if (result.isFinal)
{
TLLM_LOG_DEBUG("finishReason");
std::string reasonStr;
for (auto const reason : finishReason)
{
// cast for easier visibility during debugging
EXPECT_EQ(static_cast<int>(reason), static_cast<int>(referenceFinishReasons[reqId - 1]));
reasonStr += std::to_string(static_cast<int>(reason)) + " ";
}
TLLM_LOG_DEBUG("%s", reasonStr.c_str());
}
EXPECT_EQ(beamWidth, actualResponse.size());
for (int beam = 0; beam < beamWidth; beam++)
{
EXPECT_EQ(referenceResponse.at(beam).size(), actualResponse.at(beam).size());
EXPECT_THAT(actualResponse.at(beam), testing::ElementsAreArray(referenceResponse.at(beam)));
}
}
}
}
}
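// Test suite instantiations: parameter combinations covering the GPT, Llama, Medusa and ChatGLM engines.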
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(false, true), // excludeInputFromOutput
testing::Values(1, 2) // beamWidth
),
generateTestName);
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, ParamStatsTest,
testing::Combine( //
testing::Values(0, 1000), // iterStatsMaxIterations
testing::Values(false, true) // useOrchestratorMode
),
generateTestNameStats);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, ParamCancelReqTest,
testing::Combine( //
testing::Values(false, true), // useOrchestratorMode
testing::Values(1, 2), // beamWidth
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
),
generateTestNameCancelReq);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, TimeoutTest,
testing::Combine( //
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp1_pp1_cp1"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(2) // beamWidth
),
generateTestNameTimeoutTest);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LeaderApiUsageTest,
testing::Combine( //
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1") // modelName
),
generateTestNameLeaderApiUsage);
INSTANTIATE_TEST_SUITE_P(GptExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(true), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(true), // returnContextLogits
testing::Values(true), // returnGenerationLogits
testing::Values("gpt"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false, true), // returnAllGeneratedTokens
testing::Values(1, 2) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(true), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(true), // returnGenerationLogits
testing::Values("llama_tp1_pp4_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(LlamaMultiExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("llama_tp1_pp2_cp1"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(MedusaExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false, true), // streaming
testing::Values(1), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false, true), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("medusa"), // modelName
testing::Values(false, true), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
// Disable some of ChatGLM's tests since they are the same as gpt's.
INSTANTIATE_TEST_SUITE_P(ChatGlmExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false), // streaming
testing::Values(1, 2), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("chatglm"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1, 2) // numReturnSequences
),
generateTestNameAllParams);
// ChatGlm0 Test is for glm-10b.
INSTANTIATE_TEST_SUITE_P(ChatGlm0ExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false), // streaming
testing::Values(1), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("glm"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(ChatGlm2ExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false), // streaming
testing::Values(1), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("chatglm2"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(ChatGlm3ExecutorTest, AllParamsTest,
testing::Combine( //
testing::Values(false), // streaming
testing::Values(1), // beamWidth
testing::Values(false), // computeLogProbs
testing::Values(false), // excludeInputInOutput
testing::Values(false), // returnContextLogits
testing::Values(false), // returnGenerationLogits
testing::Values("chatglm3"), // modelName
testing::Values(false), // useOrchestratorMode
testing::Values(false), // returnAllGeneratedTokens
testing::Values(1) // numReturnSequences
),
generateTestNameAllParams);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorTest, LogitsProcParamsTest,
testing::Combine( //
testing::Values(
"llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1"), // modelName
testing::Values(false, true), // batched
testing::Values(false, true) // replicated
),
generateTestNameLogitsProc);
INSTANTIATE_TEST_SUITE_P(GptExecutorGuidedDecodingTest, GuidedDecodingParamsTest,
testing::Combine(testing::Values("gpt")), generateTestNameGuidedDecoding);
INSTANTIATE_TEST_SUITE_P(LlamaExecutorGuidedDecodingTest, GuidedDecodingParamsTest,
testing::Combine(
testing::Values("llama_tp1_pp1_cp1", "llama_tp4_pp1_cp1", "llama_tp2_pp2_cp1", "llama_tp1_pp4_cp1")),
generateTestNameGuidedDecoding);