// TensorRT-LLMs/cpp/tests/unit_tests/executor/executorTestSmall.cpp
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/rawEngine.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tests/utils/common.h"
#include "tests/utils/engines.h"
#include "tests/utils/executorUtils.h"
#include <tensorrt_llm/batch_manager/trtGptModelInflightBatching.h>
#include "gtest/gtest.h"
#include <random>
#include <tuple>
namespace tensorrt_llm::testing
{
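// Parameters for the trivial constant-decoder tests. The fields mirror, in order,
// the tuple produced by ::testing::Combine below.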
struct TrivialConstantDecoderTestParameters
{
using TupleT = std::tuple<runtime::SizeType32, runtime::SizeType32, runtime::SizeType32, runtime::SizeType32,
runtime::SizeType32, runtime::SizeType32, runtime::SizeType32, runtime::SizeType32>;
runtime::SizeType32 randomSeed;
runtime::SizeType32 vocabSize;
runtime::SizeType32 maxNumTokens;
runtime::SizeType32 maxBeamWidth;
runtime::SizeType32 maxBatchSize;
runtime::SizeType32 numRequests;
runtime::SizeType32 promptLength;
runtime::SizeType32 maxOutputLength;
// Constructor that takes a tuple.
// Implicit to allow gtest to convert from the tuple generated by ::testing::Combine.
// NOLINTNEXTLINE
TrivialConstantDecoderTestParameters(TupleT t)
: randomSeed(std::get<0>(t))
, vocabSize(std::get<1>(t))
, maxNumTokens(std::get<2>(t))
, maxBeamWidth(std::get<3>(t))
, maxBatchSize(std::get<4>(t))
, numRequests(std::get<5>(t))
, promptLength(std::get<6>(t))
, maxOutputLength(std::get<7>(t))
{
}
};
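// State shared by one decoder test: the logger, the RNG seeded from the test
// parameters, the Executor under test, and the constant logits baked into the
// trivial decoder engine, kept as a reference.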
template <typename TLogits>
struct DecoderTestShared
{
static constexpr runtime::SizeType32 kNumTokensPerBlock = 64;
static constexpr runtime::SizeType32 kKvCacheMaxTokens = 2048 * 8;
DecoderTestShared(std::shared_ptr<runtime::TllmLogger> logger, std::mt19937 rng,
std::shared_ptr<executor::Executor> executor, std::vector<TLogits> randomLogits)
: logger(std::move(logger))
, rng(rng)
, executor(std::move(executor))
, randomLogits(std::move(randomLogits))
{
}
std::shared_ptr<runtime::TllmLogger> logger;
std::mt19937 rng;
std::shared_ptr<executor::Executor> executor;
std::vector<TLogits> randomLogits;
};
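// Builds the full test harness: serializes a trivial decoder engine that always
// emits the same random logits, derives a matching ModelConfig, wraps the engine
// in a TrtGptModelInflightBatching model, and hands that model to an Executor.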
template <typename TLogits>
std::unique_ptr<DecoderTestShared<TLogits>> SetupDecoderTest(TrivialConstantDecoderTestParameters const& params)
{
auto logger = std::make_shared<runtime::TllmLogger>();
auto rng = std::mt19937(params.randomSeed);
auto randomLogits = tensorrt_llm::testing::randomLogits<std::mt19937, TLogits>(params.vocabSize, &rng);
auto const decoderParameters = tensorrt_llm::testing::utils::engines::ConstantTrivialDecoderParameters<TLogits>{
tensorrt_llm::testing::utils::engines::TrivialDecoderParameters{params.vocabSize, params.maxBatchSize,
params.maxNumTokens, DecoderTestShared<TLogits>::kNumTokensPerBlock, params.maxBeamWidth, false},
randomLogits};
auto engineHostMemory
= tensorrt_llm::testing::utils::engines::createConstantTrivialDecoder<TLogits>(decoderParameters, logger);
auto const engine = runtime::RawEngine(engineHostMemory.release());
auto const dtype = runtime::TRTDataType<TLogits>::value;
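// The model configuration must mirror the trivial engine: a single attention
// layer using the GPT attention plugin, packed inputs, and a paged KV cache.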
auto modelConfig = runtime::ModelConfig(params.vocabSize, 1, 1, 0, 1, 1, dtype);
modelConfig.useGptAttentionPlugin(true);
modelConfig.setModelVariant(runtime::ModelConfig::ModelVariant::kGpt);
modelConfig.usePackedInput(true);
modelConfig.setKVCacheType(runtime::ModelConfig::KVCacheType::kPAGED);
modelConfig.setMaxNumTokens(params.maxNumTokens);
modelConfig.setMaxBatchSize(params.maxBatchSize);
modelConfig.setMaxBeamWidth(params.maxBeamWidth);
modelConfig.setMaxSequenceLen(params.maxNumTokens);
modelConfig.setMaxInputLen(params.maxNumTokens);
modelConfig.setLayerTypes({runtime::ModelConfig::LayerType::kATTENTION});
modelConfig.setTokensPerBlock(DecoderTestShared<TLogits>::kNumTokensPerBlock);
modelConfig.setPagedContextFMHA(true);
auto const worldConfig = runtime::WorldConfig();
auto optionalParams = batch_manager::TrtGptModelOptionalParams{};
auto kvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig{};
kvCacheConfig.maxTokens = DecoderTestShared<TLogits>::kKvCacheMaxTokens;
optionalParams.kvCacheConfig = kvCacheConfig;
auto model = std::make_shared<batch_manager::TrtGptModelInflightBatching>(
logger, modelConfig, worldConfig, engine, false, optionalParams);
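// Beam width, max batch size, and max token count come from the test parameters;
// the long positional tail pins the remaining executor options explicitly.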
auto const executorConfig = tensorrt_llm::executor::ExecutorConfig(params.maxBeamWidth, executor::SchedulerConfig(),
executor::KvCacheConfig{}, true, true, 1, 1, executor::BatchingType::kINFLIGHT, params.maxBatchSize,
params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, 1, std::nullopt,
executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt);
return std::make_unique<DecoderTestShared<TLogits>>(
logger, rng, std::make_shared<executor::Executor>(model, executorConfig), randomLogits);
}
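// Value-parameterized fixture: the shared state is built once per test from the
// generated parameters, and runDecoderTest drives the Executor with a batch of
// identical requests.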
template <typename TLogits>
class DecoderTest : public ::testing::Test, public ::testing::WithParamInterface<TrivialConstantDecoderTestParameters>
{
protected:
std::unique_ptr<DecoderTestShared<TLogits>> state;
DecoderTest()
{
auto const params = GetParam();
state = SetupDecoderTest<TLogits>(params);
}
void runDecoderTest(TrivialConstantDecoderTestParameters const& parameters)
{
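// Enqueue numRequests identical, non-streaming requests over the same
// consecutive-token prompt, then run them through the executor with a generous
// one-hour timeout. The fourth OutputConfig flag (excludeInputFromOutput) is
// set, so each beam is expected to hold exactly maxOutputLength generated tokens.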
auto const requestTokens = createConsecutiveTokenSequence(parameters.promptLength, parameters.vocabSize, 0);
auto requests = std::vector<executor::Request>{};
requests.reserve(static_cast<std::size_t>(parameters.numRequests));
for (auto i = 0; i < parameters.numRequests; i++)
{
requests.emplace_back(requestTokens, parameters.maxOutputLength, false, executor::SamplingConfig{},
executor::OutputConfig{false, false, false, true, false, false});
}
auto const accumulatedResponses
= runThroughRequests(*state->executor, requests, std::chrono::duration<float, std::milli>(3600000));
ASSERT_EQ(accumulatedResponses.size(), static_cast<std::size_t>(parameters.numRequests));
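// Order the reference logits from highest to lowest: the token ranking a
// greedy decoder over constant logits would follow.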
std::sort(state->randomLogits.begin(), state->randomLogits.end(), std::greater<>{});
for (auto const& [requestId, responses] : accumulatedResponses)
{
for (auto const& response : responses)
{
ASSERT_FALSE(response.hasError());
auto const& tokensByBeam = response.getResult().outputTokenIds;
ASSERT_EQ(tokensByBeam.size(), 1);
for (auto const& tokensForBeam : tokensByBeam)
{
ASSERT_EQ(tokensForBeam.size(), static_cast<std::size_t>(parameters.maxOutputLength));
}
}
}
}
};
namespace
{
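// Each axis below holds a single value, so ::testing::Combine currently yields
// exactly one combination; adding values to any axis fans out the product.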
constexpr runtime::SizeType32 kRandomSeed1 = 45;
auto const randomSeeds = ::testing::Values(kRandomSeed1);
constexpr runtime::SizeType32 kMinVocabSize = 16;
auto const vocabSizes = ::testing::Values(kMinVocabSize);
constexpr runtime::SizeType32 kMinMaxNumTokens = 2048;
auto const maxNumTokensValues = ::testing::Values(kMinMaxNumTokens);
constexpr runtime::SizeType32 kMinBeamWidth = 1;
auto const beamWidths = ::testing::Values(kMinBeamWidth);
constexpr runtime::SizeType32 kMinMaxBatchSize = 2048;
auto const maxBatchSizes = ::testing::Values(kMinMaxBatchSize);
constexpr runtime::SizeType32 kMinNumRequests = 64;
auto const numRequestsValues = ::testing::Values(kMinNumRequests);
constexpr runtime::SizeType32 kMinPromptLength = 32;
auto const promptLengths = ::testing::Values(kMinPromptLength);
constexpr runtime::SizeType32 kMinMaxOutputLength = 16;
auto const maxOutputLengths = ::testing::Values(kMinMaxOutputLength);
auto const paramGenerator
= ::testing::ConvertGenerator<TrivialConstantDecoderTestParameters::TupleT>(::testing::Combine(randomSeeds,
vocabSizes, maxNumTokensValues, beamWidths, maxBatchSizes, numRequestsValues, promptLengths, maxOutputLengths));
} // namespace
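// Instantiate the suite for float logits; other logits types could reuse
// DecoderTest<T> with another alias and instantiation. The name generator below
// encodes every parameter value into the test name to make failures easy to
// attribute.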
using DecoderFloatTest = DecoderTest<float>;
TEST_P(DecoderFloatTest, TestSizeAndValues)
{
runDecoderTest(GetParam());
}
INSTANTIATE_TEST_SUITE_P(Float, DecoderFloatTest, paramGenerator,
[](::testing::TestParamInfo<TrivialConstantDecoderTestParameters> const& info) -> std::string
{
std::stringstream nameStringStream;
nameStringStream << "_maxBatchSize_" << info.param.maxBatchSize << "_vocabSize_" << info.param.vocabSize
<< "_maxBeamWidth_" << info.param.maxBeamWidth << "_maxNumTokens_" << info.param.maxNumTokens
<< "_maxOutputLength_" << info.param.maxOutputLength << "_numRequests_"
<< info.param.numRequests << "_promptLength_" << info.param.promptLength << "_randomSeed_"
<< info.param.randomSeed;
return nameStringStream.str();
});
} // namespace tensorrt_llm::testing