Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
* refactor: Update ExecutorConfig to use AdditionalModelOutput type

  - Changed function signatures and member variables across multiple files, replacing std::optional<std::vector<std::string>> with std::optional<std::vector<executor::AdditionalModelOutput>> so that each additional output carries a gatherContext flag.
  - Updated the related serialization and deserialization methods to accommodate the new type.
  - Adjusted tests to reflect the changes in the output handling structure.

  This refactor makes the output configuration in the executor and batch manager components more flexible and maintainable.

* refactor: Remove equality operator from TrtGptModelOptionalParams

  - Deleted the operator== implementation from TrtGptModelOptionalParams to simplify the class.
  - Updated the pybind11 bindings to no longer expose the equality operator to Python.

* refactor: Enhance copyAdditionalOutputs to utilize AdditionalModelOutput

  - Updated the copyAdditionalOutputs function to accept a vector of AdditionalModelOutput, allowing the gatherContext flag to be taken into account.
  - Adjusted the logic to handle context and non-context outputs separately.
  - Modified the related unit tests to cover the new gatherContext parameter.

* refactor: Introduce findOutputTensor utility function for output tensor retrieval

  - Added a utility function, findOutputTensor, that encapsulates the logic for finding output tensors and checking their validity.
  - Refactored copyAdditionalOutputs to use findOutputTensor, reducing code duplication.
  - Enhanced error checking for additional context and generation output tensors.

* refactor: Check final indices of additional output tensors and update tests

  - Added checks to verify the final indices of additional output tensors for context and generation outputs, and updated the unit tests accordingly.
  - Added a lastTokenIds input tensor to the test engines.
  - The logits output now depends on the gatherContextLogits parameter.
  - Removed the gatherContextOutputs parameter from the validate method in LlmRequest; context outputs do not depend on the computeContextLogits parameter.

* fixup! refactor: Check final indices of additional output tensors and update tests

* fixup! refactor: Update ExecutorConfig to use AdditionalModelOutput type

* fixup! refactor: Remove equality operator from TrtGptModelOptionalParams

* docs: Update executor.md

* chore: Clean up includes

---------

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
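For context, a minimal sketch of the configuration surface this refactor introduces, assuming (as the commit messages describe) that executor::AdditionalModelOutput pairs a tensor name with a gatherContext flag and that OutputConfig accepts an optional vector of such entries. The tensor name and the helper function below are placeholders for illustration, not part of this PR:

#include "tensorrt_llm/executor/executor.h"

#include <vector>

namespace exec = tensorrt_llm::executor;

// Sketch: request one additional output tensor from the model. Before this
// refactor the entry would have been a plain std::string name; now each entry
// also carries gatherContext, which selects whether the tensor is gathered
// during the context (prefill) phase as well as during generation.
exec::OutputConfig makeOutputConfig() // hypothetical helper
{
    auto additionalOutputs = std::vector<exec::AdditionalModelOutput>{
        exec::AdditionalModelOutput{"hidden_states_output", /*gatherContext=*/true}};

    auto outputConfig = exec::OutputConfig{};
    outputConfig.additionalModelOutputs = additionalOutputs;
    return outputConfig;
}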
206 lines
9.0 KiB
C++
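// End-to-end decoder test: drives a trivial constant-logits engine through the
// inflight-batching executor and checks that every request completes without
// error and returns the expected number of output tokens per beam.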
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
|
|
#include "tensorrt_llm/executor/executor.h"
|
|
#include "tensorrt_llm/runtime/common.h"
|
|
#include "tensorrt_llm/runtime/rawEngine.h"
|
|
#include "tensorrt_llm/runtime/tllmLogger.h"
|
|
#include "tests/utils/common.h"
|
|
#include "tests/utils/engines.h"
|
|
#include "tests/utils/executorUtils.h"
|
|
#include <tensorrt_llm/batch_manager/trtGptModelInflightBatching.h>
|
|
|
|
#include "gtest/gtest.h"
|
|
|
|
#include <random>
|
|
#include <tuple>
|
|
|
|
namespace tensorrt_llm::testing
{

// Test parameters for the trivial constant-decoder test. The TupleT alias and
// the implicit tuple constructor allow gtest's ::testing::Combine to generate
// instances of this struct.
struct TrivialConstantDecoderTestParameters
{
    using TupleT = std::tuple<runtime::SizeType32, runtime::SizeType32, runtime::SizeType32, runtime::SizeType32,
        runtime::SizeType32, runtime::SizeType32, runtime::SizeType32, runtime::SizeType32>;

    runtime::SizeType32 randomSeed;
    runtime::SizeType32 vocabSize;
    runtime::SizeType32 maxNumTokens;
    runtime::SizeType32 maxBeamWidth;
    runtime::SizeType32 maxBatchSize;
    runtime::SizeType32 numRequests;
    runtime::SizeType32 promptLength;
    runtime::SizeType32 maxOutputLength;

    // Constructor that takes a tuple
    TrivialConstantDecoderTestParameters( // NOLINT: implicit to allow gtest to convert from tuple generated by
                                          // 'combine'
        TupleT t)
        : randomSeed(std::get<0>(t))
        , vocabSize(std::get<1>(t))
        , maxNumTokens(std::get<2>(t))
        , maxBeamWidth(std::get<3>(t))
        , maxBatchSize(std::get<4>(t))
        , numRequests(std::get<5>(t))
        , promptLength(std::get<6>(t))
        , maxOutputLength(std::get<7>(t))
    {
    }
};

// State shared between the setup phase and the test body: the logger, the
// seeded RNG, the executor under test and the constant logits baked into the
// trivial engine.
template <typename TLogits>
struct DecoderTestShared
{
    static constexpr runtime::SizeType32 kNumTokensPerBlock = 64;
    static constexpr runtime::SizeType32 kKvCacheMaxTokens = 2048 * 8;

    DecoderTestShared(std::shared_ptr<runtime::TllmLogger> logger, std::mt19937 rng,
        std::shared_ptr<executor::Executor> executor, std::vector<TLogits> randomLogits)
        : logger(std::move(logger))
        , rng(rng)
        , executor(std::move(executor))
        , randomLogits(std::move(randomLogits))
    {
    }

    std::shared_ptr<runtime::TllmLogger> logger;
    std::mt19937 rng;
    std::shared_ptr<executor::Executor> executor;
    std::vector<TLogits> randomLogits;
};

template <typename TLogits>
std::unique_ptr<DecoderTestShared<TLogits>> SetupDecoderTest(TrivialConstantDecoderTestParameters const& params)
{
    auto logger = std::make_shared<runtime::TllmLogger>();
    auto rng = std::mt19937(params.randomSeed);
    auto randomLogits = tensorrt_llm::testing::randomLogits<std::mt19937, TLogits>(params.vocabSize, &rng);

    // Build a trivial engine that emits the same fixed logits at every step.
    auto const decoderParameters = tensorrt_llm::testing::utils::engines::ConstantTrivialDecoderParameters<TLogits>{
        tensorrt_llm::testing::utils::engines::TrivialDecoderParameters{params.vocabSize, params.maxBatchSize,
            params.maxNumTokens, DecoderTestShared<TLogits>::kNumTokensPerBlock, params.maxBeamWidth, false},
        randomLogits};
    auto engineHostMemory
        = tensorrt_llm::testing::utils::engines::createConstantTrivialDecoder<TLogits>(decoderParameters, logger);
    auto const engine = runtime::RawEngine(engineHostMemory.release());

    // Single-layer GPT-style model configuration with packed input and a paged
    // KV cache, sized according to the test parameters.
    auto const dtype = runtime::TRTDataType<TLogits>::value;
    auto modelConfig = runtime::ModelConfig(params.vocabSize, 1, 1, 0, 1, 1, dtype);
    modelConfig.useGptAttentionPlugin(true);
    modelConfig.setModelVariant(runtime::ModelConfig::ModelVariant::kGpt);
    modelConfig.usePackedInput(true);
    modelConfig.setKVCacheType(runtime::ModelConfig::KVCacheType::kPAGED);
    modelConfig.setMaxNumTokens(params.maxNumTokens);
    modelConfig.setMaxBatchSize(params.maxBatchSize);
    modelConfig.setMaxBeamWidth(params.maxBeamWidth);
    modelConfig.setMaxSequenceLen(params.maxNumTokens);
    modelConfig.setMaxInputLen(params.maxNumTokens);
    modelConfig.setLayerTypes({runtime::ModelConfig::LayerType::kATTENTION});
    modelConfig.setTokensPerBlock(DecoderTestShared<TLogits>::kNumTokensPerBlock);
    modelConfig.setPagedContextFMHA(true);

    auto const worldConfig = runtime::WorldConfig();
    auto optionalParams = batch_manager::TrtGptModelOptionalParams{};
    auto kvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig{};

    // Limit the KV cache to a fixed token budget.
    kvCacheConfig.maxTokens = DecoderTestShared<TLogits>::kKvCacheMaxTokens;
    optionalParams.kvCacheConfig = kvCacheConfig;
    auto model = std::make_shared<batch_manager::TrtGptModelInflightBatching>(
        logger, modelConfig, worldConfig, engine, false, optionalParams);
    auto const executorConfig = tensorrt_llm::executor::ExecutorConfig(params.maxBeamWidth, executor::SchedulerConfig(),
        executor::KvCacheConfig{}, true, true, 1, 1, executor::BatchingType::kINFLIGHT, params.maxBatchSize,
        params.maxNumTokens, std::nullopt, std::nullopt, std::nullopt, std::nullopt, 1, std::nullopt,
        executor::ExtendedRuntimePerfKnobConfig(), std::nullopt, 0,
        executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, std::nullopt, std::nullopt);
    return std::make_unique<DecoderTestShared<TLogits>>(
        logger, rng, std::make_shared<executor::Executor>(model, executorConfig), randomLogits);
}

// Parameterized fixture: constructs the executor once per test case and runs
// identical requests against it.
template <typename TLogits>
class DecoderTest : public ::testing::Test, public ::testing::WithParamInterface<TrivialConstantDecoderTestParameters>
{
protected:
    std::unique_ptr<DecoderTestShared<TLogits>> state;

    DecoderTest()
    {
        auto const params = GetParam();
        state = SetupDecoderTest<TLogits>(params);
    }

    void runDecoderTest(TrivialConstantDecoderTestParameters const& parameters)
    {
        // Enqueue numRequests identical requests with the same prompt.
        auto const requestTokens = createConsecutiveTokenSequence(parameters.promptLength, parameters.vocabSize, 0);
        auto requests = std::vector<executor::Request>{};
        requests.reserve(static_cast<std::size_t>(parameters.numRequests));
        for (auto i = 0; i < parameters.numRequests; i++)
        {
            requests.emplace_back(requestTokens, parameters.maxOutputLength, false, executor::SamplingConfig{},
                executor::OutputConfig{false, false, false, true, false, false});
        }
        auto const accumulatedResponses
            = runThroughRequests(*state->executor, requests, std::chrono::duration<float, std::milli>(3600000));
        ASSERT_EQ(accumulatedResponses.size(), parameters.numRequests);

        // Sort the engine's constant logits in descending order.
        std::sort(state->randomLogits.begin(), state->randomLogits.end(), std::greater<>());
        for (auto const& [requestId, responses] : accumulatedResponses)
        {
            for (auto const& response : responses)
            {
                ASSERT_FALSE(response.hasError());
                auto const& tokensByBeam = response.getResult().outputTokenIds;
                ASSERT_EQ(tokensByBeam.size(), 1);
                for (auto const& tokensForBeam : tokensByBeam)
                {
                    ASSERT_EQ(tokensForBeam.size(), parameters.maxOutputLength);
                }
            }
        }
    }
};

namespace
{
constexpr runtime::SizeType32 kRandomSeed1 = 45;
auto const randomSeeds = ::testing::Values(kRandomSeed1);

constexpr runtime::SizeType32 kMinVocabSize = 16;
auto const vocabSizes = ::testing::Values(kMinVocabSize);

constexpr runtime::SizeType32 kMinMaxNumTokens = 2048;
auto const maxNumTokenses = ::testing::Values(kMinMaxNumTokens);

constexpr runtime::SizeType32 kMinBeamWidth = 1;
auto const beamWidths = ::testing::Values(kMinBeamWidth);

constexpr runtime::SizeType32 kMinMaxBatchSize = 2048;
auto const maxBatchSizes = ::testing::Values(kMinMaxBatchSize);

constexpr runtime::SizeType32 kMinNumRequests = 64;
auto const numRequestses = ::testing::Values(kMinNumRequests);

constexpr runtime::SizeType32 kMinPromptLength = 32;
auto const promptLengths = ::testing::Values(kMinPromptLength);

constexpr runtime::SizeType32 kMinMaxOutputLength = 16;
auto const maxOutputLengths = ::testing::Values(kMinMaxOutputLength);

auto const paramGenerator
    = ::testing::ConvertGenerator<TrivialConstantDecoderTestParameters::TupleT>(::testing::Combine(randomSeeds,
        vocabSizes, maxNumTokenses, beamWidths, maxBatchSizes, numRequestses, promptLengths, maxOutputLengths));
} // namespace

using DecoderFloatTest = DecoderTest<float>;

TEST_P(DecoderFloatTest, TestSizeAndValues)
{
    runDecoderTest(GetParam());
}

INSTANTIATE_TEST_SUITE_P(Float, DecoderFloatTest, paramGenerator,
    [](::testing::TestParamInfo<TrivialConstantDecoderTestParameters> const& info) -> std::string
    {
        std::stringstream nameStringStream;
        nameStringStream << "_maxBatchSize_" << info.param.maxBatchSize << "_vocabSize_" << info.param.vocabSize
                         << "_maxBeamWidth_" << info.param.maxBeamWidth << "_maxNumTokens_" << info.param.maxNumTokens
                         << "_maxOutputLength_" << info.param.maxOutputLength << "_numRequests_"
                         << info.param.numRequests << "_promptLength_" << info.param.promptLength << "_randomSeed_"
                         << info.param.randomSeed;
        return nameStringStream.str();
    });

} // namespace tensorrt_llm::testing