TensorRT-LLM/cpp/tests/runtime/gptDecoderTest.cpp

/*
 * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <gtest/gtest.h>

#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/gptDecoder.h"
#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;

namespace
{
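
// Builds a minimal single-rank GPT configuration, creates an IGptDecoder, runs two decoding steps on all-zero
// logits, and checks the produced token ids and the finished-sequence bookkeeping.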
void testDecoder(nvinfer1::DataType const dtype, SamplingConfig const& samplingConfig)
{
    SizeType constexpr tensorParallelism{1};
    SizeType constexpr pipelineParallelism{1};
    SizeType constexpr localRank{0};
    WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank};

    SizeType constexpr vocabSize{51200};
    SizeType constexpr nbLayers{2};
    SizeType constexpr nbHeads{16};
    SizeType constexpr hiddenSize{1024};
    GptModelConfig modelConfig{vocabSize, nbLayers, nbHeads, hiddenSize, dtype};
    modelConfig.useGptAttentionPlugin(false);

    auto streamPtr = std::make_shared<CudaStream>();
    BufferManager manager(streamPtr);

    // create decoder
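    // The decoder works on the padded vocabulary size, i.e. vocabSize rounded up so that it splits evenly across
    // the tensor-parallel ranks; with a world size of 1 the padded size equals vocabSize.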
    auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
    auto decoder = IGptDecoder::create(modelConfig.getDataType(), vocabSize, vocabSizePadded, streamPtr);
    ASSERT_TRUE(static_cast<bool>(decoder));

    // setup decoder
    auto const beamWidth = samplingConfig.beamWidth;
    SizeType constexpr batchSize{4};
    SizeType constexpr maxInputLength{8};
    SizeType constexpr maxNewTokens{2};
    auto constexpr maxSeqLength = maxInputLength + maxNewTokens;
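    // setup() sizes the decoder's internal state for batchSize sequences of up to maxSeqLength tokens, using the
    // given sampling configuration.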
    decoder->setup(samplingConfig, batchSize, maxSeqLength);

    // set up inputs
    auto logits = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, beamWidth, vocabSizePadded}), modelConfig.getDataType()));
    manager.setZero(*logits);
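    // All-zero logits make every vocabulary entry equally likely, so the decoding kernels produce deterministic
    // token ids that the checks below rely on.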
    int constexpr endId{50257};
    std::vector<int> const endIdsVec(batchSize * beamWidth, endId);
    auto endIds
        = std::shared_ptr(manager.copyFrom(endIdsVec, ITensor::makeShape({batchSize, beamWidth}), MemoryType::kGPU));

    DecodingInput inputs{maxInputLength, maxSeqLength, batchSize, logits, endIds};
    std::vector<std::int32_t> sequenceLimitLengthsVec(batchSize, maxSeqLength);
    inputs.sequenceLimitLength
        = manager.copyFrom(sequenceLimitLengthsVec, ITensor::makeShape({batchSize}), MemoryType::kGPU);
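    // Beam search additionally needs a cache indirection tensor describing how beams are laid out in the KV cache.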
    if (beamWidth > 1)
    {
        auto srcCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*srcCacheIndirection);
        inputs.cacheIndirection = srcCacheIndirection;
    }

    // set up outputs
    auto outputIds = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
    manager.setZero(*outputIds);
    DecodingOutput outputs{outputIds};
    auto newTokens
        = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32));
    manager.setZero(*newTokens);
    outputs.newTokens = newTokens;
    std::vector<int> sequenceLengthsVec(batchSize * beamWidth, maxInputLength);
    outputs.lengths
        = manager.copyFrom(sequenceLengthsVec, ITensor::makeShape({batchSize, beamWidth}), MemoryType::kGPU);
    outputs.finished = manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kBOOL);
    inputs.finished = ITensor::view(outputs.finished);
    manager.setZero(*outputs.finished);
    outputs.finishedSum = BufferManager::pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
    auto* finishedSumHost = bufferCast<std::int32_t>(*outputs.finishedSum);
    *finishedSumHost = -1;
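    // Outputs that only beam search fills in: the target cache indirection, cumulative log-probabilities and
    // parent beam ids.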
    if (beamWidth > 1)
    {
        auto tgtCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*tgtCacheIndirection);
        outputs.cacheIndirection = tgtCacheIndirection;
        auto cumLogProbs
            = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kFLOAT));
        manager.setZero(*cumLogProbs);
        outputs.cumLogProbs = cumLogProbs;
        auto parentIds = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*parentIds);
        outputs.parentIds = parentIds;
    }

    // run decoder
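    // First step: no sequence has produced endId or reached its sequence limit yet, so forward() is expected to
    // report that decoding is not done and finishedSum should read 0.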
    EXPECT_FALSE(decoder->forward(outputs, inputs));
    inputs.step += 1;
    EXPECT_EQ(*finishedSumHost, 0);

    // verify results
    auto outputsIdsHost = manager.copyFrom(*outputs.ids, MemoryType::kCPU);
    auto output = bufferCast<std::int32_t>(*outputsIdsHost);
    manager.getStream().synchronize();
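    // The output ids are laid out as [batchSize, beamWidth, maxSeqLength]: the first maxInputLength positions were
    // never written and stay 0, while the freshly generated position holds a fixed token id (1023 on the sampling
    // path, the beam index for beam search).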
    for (auto b = 0; b < batchSize; ++b)
    {
        for (auto bw = 0; bw < beamWidth; ++bw)
        {
            auto const result = (beamWidth == 1) ? 1023 : bw;
            bool anyMismatch = false;
            for (auto i = 0; i < maxInputLength; ++i)
            {
                auto const outputIndex = tc::flat_index3(b, bw, i, beamWidth, maxSeqLength);
                EXPECT_EQ(output[outputIndex], 0) << " b: " << b << " bw: " << bw << " i: " << i;
                anyMismatch |= (output[outputIndex] != 0);
            }
            for (auto i = 0; i < maxNewTokens - 1; ++i)
            {
                auto const index = tc::flat_index3(b, bw, maxInputLength + i, beamWidth, maxSeqLength);
                EXPECT_EQ(output[index], result) << " b: " << b << " bw: " << bw << " i: " << i;
                anyMismatch |= (output[index] != result);
            }
            ASSERT_FALSE(anyMismatch);
        }
    }

    // run decoder again
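    // Second step: every sequence reaches its sequenceLimitLength (maxInputLength + maxNewTokens), so forward()
    // is expected to report that all batchSize * beamWidth sequences are finished.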
    EXPECT_TRUE(decoder->forward(outputs, inputs));
    EXPECT_EQ(*finishedSumHost, outputs.finished->getSize());
}
} // namespace

class ParamTest : public ::testing::TestWithParam<std::tuple<nvinfer1::DataType, SizeType>>
{
};

TEST_P(ParamTest, Test)
{
    nvinfer1::DataType const dtype{std::get<0>(GetParam())};
    SizeType const beamWidth{std::get<1>(GetParam())};
    SamplingConfig const samplingConfig{beamWidth};
    testDecoder(dtype, samplingConfig);
}
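
// Instantiates the test for float and half logits combined with beam widths 1 and 3, naming each case accordingly
// (e.g. FloatSampling, HalfBeamWidth3).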
INSTANTIATE_TEST_SUITE_P(GptDecoderTest, ParamTest,
    testing::Combine(testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kHALF), testing::Values(1, 3)),
    [](const testing::TestParamInfo<ParamTest::ParamType>& info)
    {
        std::string name{std::get<0>(info.param) == nvinfer1::DataType::kFLOAT ? "Float" : "Half"};
        auto const beamWidth = std::get<1>(info.param);
        name.append(beamWidth == 1 ? "Sampling" : "BeamWidth" + std::to_string(beamWidth));
        return name;
    });