#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" #include "tensorrt_llm/executor/executor.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/rawEngine.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include "tests/utils/common.h" #include "tests/utils/engines.h" #include "tests/utils/executorUtils.h" #include #include "gtest/gtest.h" #include #include namespace tensorrt_llm::testing { struct TrivialConstantDecoderTestParameters { using TupleT = std::tuple; runtime::SizeType32 randomSeed; runtime::SizeType32 vocabSize; runtime::SizeType32 maxNumTokens; runtime::SizeType32 maxBeamWidth; runtime::SizeType32 maxBatchSize; runtime::SizeType32 numRequests; runtime::SizeType32 promptLength; runtime::SizeType32 maxOutputLength; // Constructor that takes a tuple TrivialConstantDecoderTestParameters( // NOLINT: implicit to allow gtest to convert from tuple generated by // 'combine' TupleT t) : randomSeed(std::get<0>(t)) , vocabSize(std::get<1>(t)) , maxNumTokens(std::get<2>(t)) , maxBeamWidth(std::get<3>(t)) , maxBatchSize(std::get<4>(t)) , numRequests(std::get<5>(t)) , promptLength(std::get<6>(t)) , maxOutputLength(std::get<7>(t)) { } }; template struct DecoderTestShared { static constexpr runtime::SizeType32 kNumTokensPerBlock = 64; static constexpr runtime::SizeType32 kKvCacheMaxTokens = 2048 * 8; DecoderTestShared(std::shared_ptr logger, std::mt19937 rng, std::shared_ptr executor, std::vector randomLogits) : logger(std::move(logger)) , rng(rng) , executor(std::move(executor)) , randomLogits(std::move(randomLogits)){}; std::shared_ptr logger; std::mt19937 rng; std::shared_ptr executor; std::vector randomLogits; }; template std::unique_ptr> SetupDecoderTest(TrivialConstantDecoderTestParameters const& params) { auto logger = std::make_shared(); auto rng = std::mt19937(params.randomSeed); auto randomLogits = tensorrt_llm::testing::randomLogits(params.vocabSize, &rng); auto const decoderParameters = tensorrt_llm::testing::utils::engines::ConstantTrivialDecoderParameters{ tensorrt_llm::testing::utils::engines::TrivialDecoderParameters{params.vocabSize, params.maxBatchSize, params.maxNumTokens, DecoderTestShared::kNumTokensPerBlock, params.maxBeamWidth, false}, randomLogits}; auto engineHostMemory = tensorrt_llm::testing::utils::engines::createConstantTrivialDecoder(decoderParameters, logger); auto const engine = runtime::RawEngine(engineHostMemory.release()); auto const dtype = runtime::TRTDataType::value; auto modelConfig = runtime::ModelConfig(params.vocabSize, 1, 1, 0, 1, 1, dtype); modelConfig.useGptAttentionPlugin(true); modelConfig.setModelVariant(runtime::ModelConfig::ModelVariant::kGpt); modelConfig.usePackedInput(true); modelConfig.setKVCacheType(runtime::ModelConfig::KVCacheType::kPAGED); modelConfig.setMaxNumTokens(params.maxNumTokens); modelConfig.setMaxBatchSize(params.maxBatchSize); modelConfig.setMaxBeamWidth(params.maxBeamWidth); modelConfig.setMaxSequenceLen(params.maxNumTokens); modelConfig.setMaxInputLen(params.maxNumTokens); modelConfig.setLayerTypes({runtime::ModelConfig::LayerType::kATTENTION}); modelConfig.setTokensPerBlock(DecoderTestShared::kNumTokensPerBlock); modelConfig.setPagedContextFMHA(true); auto const worldConfig = runtime::WorldConfig(); auto optionalParams = batch_manager::TrtGptModelOptionalParams{}; auto kvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig{}; kvCacheConfig.maxTokens = DecoderTestShared::kKvCacheMaxTokens; optionalParams.kvCacheConfig = kvCacheConfig; auto model = std::make_shared( logger, 
    auto model = std::make_shared<batch_manager::TrtGptModelInflightBatching>(
        logger, modelConfig, worldConfig, engine, false, optionalParams);
    auto const executorConfig = tensorrt_llm::executor::ExecutorConfig(params.maxBeamWidth,
        executor::SchedulerConfig(), executor::KvCacheConfig{}, true, true, 1, 1, executor::BatchingType::kINFLIGHT,
        params.maxBatchSize, params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, 1,
        std::nullopt, executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
        executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt);
    return std::make_unique<DecoderTestShared<T>>(
        logger, rng, std::make_shared<executor::Executor>(model, executorConfig), randomLogits);
}

template <typename T>
class DecoderTest : public ::testing::Test, public ::testing::WithParamInterface<TrivialConstantDecoderTestParameters>
{
protected:
    std::unique_ptr<DecoderTestShared<T>> state;

    DecoderTest()
    {
        auto const params = GetParam();
        state = SetupDecoderTest<T>(params);
    }

    // Sends numRequests identical requests through the executor and checks that every
    // request succeeds, returns a single beam, and produces exactly maxOutputLength tokens.
    void runDecoderTest(TrivialConstantDecoderTestParameters const& parameters)
    {
        auto const requestTokens = createConsecutiveTokenSequence(parameters.promptLength, parameters.vocabSize, 0);
        auto requests = std::vector<executor::Request>{};
        requests.reserve(static_cast<std::size_t>(parameters.numRequests));
        for (auto i = 0; i < parameters.numRequests; i++)
        {
            requests.emplace_back(requestTokens, parameters.maxOutputLength, false, executor::SamplingConfig{},
                executor::OutputConfig{false, false, false, /*excludeInputFromOutput=*/true, false, false});
        }
        // Generous 1 h timeout; the duration's representation and period are assumed here.
        auto const accumulatedResponses
            = runThroughRequests(*state->executor, requests, std::chrono::duration<float, std::milli>(3600000));
        ASSERT_EQ(accumulatedResponses.size(), parameters.numRequests);
        // Rank the constant logits from largest to smallest.
        std::sort(state->randomLogits.begin(), state->randomLogits.end());
        std::reverse(state->randomLogits.begin(), state->randomLogits.end());
        for (auto const& [requestId, responses] : accumulatedResponses)
        {
            for (auto const& response : responses)
            {
                ASSERT_FALSE(response.hasError());
                auto const& tokensByBeam = response.getResult().outputTokenIds;
                ASSERT_EQ(tokensByBeam.size(), 1);
                for (auto const& tokensForBeam : tokensByBeam)
                {
                    // excludeInputFromOutput is set, so only generated tokens are returned.
                    ASSERT_EQ(tokensForBeam.size(), parameters.maxOutputLength);
                }
            }
        }
    }
};

namespace
{
constexpr runtime::SizeType32 kRandomSeed1 = 45;
auto const randomSeeds = ::testing::Values(kRandomSeed1);
constexpr runtime::SizeType32 kMinVocabSize = 16;
auto const vocabSizes = ::testing::Values(kMinVocabSize);
constexpr runtime::SizeType32 kMinMaxNumTokens = 2048;
auto const maxNumTokenses = ::testing::Values(kMinMaxNumTokens);
constexpr runtime::SizeType32 kMinBeamWidth = 1;
auto const beamWidths = ::testing::Values(kMinBeamWidth);
constexpr runtime::SizeType32 kMinMaxBatchSize = 2048;
auto const maxBatchSizes = ::testing::Values(kMinMaxBatchSize);
constexpr runtime::SizeType32 kMinNumRequests = 64;
auto const numRequestses = ::testing::Values(kMinNumRequests);
constexpr runtime::SizeType32 kMinPromptLength = 32;
auto const promptLengths = ::testing::Values(kMinPromptLength);
constexpr runtime::SizeType32 kMinMaxOutputLength = 16;
auto const maxOutputLengths = ::testing::Values(kMinMaxOutputLength);
auto const paramGenerator = ::testing::ConvertGenerator<TrivialConstantDecoderTestParameters::TupleT>(
    ::testing::Combine(randomSeeds, vocabSizes, maxNumTokenses, beamWidths, maxBatchSizes, numRequestses,
        promptLengths, maxOutputLengths));
} // namespace

using DecoderFloatTest = DecoderTest<float>;

TEST_P(DecoderFloatTest, TestSizeAndValues)
{
    runDecoderTest(GetParam());
}
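// Encode every parameter into the generated test name so that a failing
// configuration can be identified directly from the gtest output.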
INSTANTIATE_TEST_SUITE_P(Float, DecoderFloatTest, paramGenerator,
    [](::testing::TestParamInfo<TrivialConstantDecoderTestParameters> const& info) -> std::string
    {
        std::stringstream nameStringStream;
        nameStringStream << "_maxBatchSize_" << info.param.maxBatchSize << "_vocabSize_" << info.param.vocabSize
                         << "_maxBeamWidth_" << info.param.maxBeamWidth << "_maxNumTokens_" << info.param.maxNumTokens
                         << "_maxOutputLength_" << info.param.maxOutputLength << "_numRequests_"
                         << info.param.numRequests << "_promptLength_" << info.param.promptLength << "_randomSeed_"
                         << info.param.randomSeed;
        return nameStringStream.str();
    });

} // namespace tensorrt_llm::testing