/*
 * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>

#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/gptDecoder.h"
#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;

namespace
{
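
// Builds a small GPT-style decoder, feeds it all-zero logits, and checks that the generated
// tokens and the finished state match the expected deterministic values.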
void testDecoder(nvinfer1::DataType const dtype, SamplingConfig const& samplingConfig)
{
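    // Single-GPU world: no tensor or pipeline parallelism.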
    SizeType constexpr tensorParallelism{1};
    SizeType constexpr pipelineParallelism{1};
    SizeType constexpr localRank{0};
    WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank};
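
    // Small GPT-like model configuration; the GPT attention plugin is disabled for this test.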
    SizeType constexpr vocabSize{51200};
    SizeType constexpr nbLayers{2};
    SizeType constexpr nbHeads{16};
    SizeType constexpr hiddenSize{1024};
    GptModelConfig modelConfig{vocabSize, nbLayers, nbHeads, hiddenSize, dtype};
    modelConfig.useGptAttentionPlugin(false);

    auto streamPtr = std::make_shared<CudaStream>();
    BufferManager manager(streamPtr);

    // create decoder
    auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
    auto decoder = IGptDecoder::create(modelConfig.getDataType(), vocabSize, vocabSizePadded, streamPtr);
    ASSERT_TRUE(static_cast<bool>(decoder));

    // setup decoder
    auto const beamWidth = samplingConfig.beamWidth;
    SizeType constexpr batchSize{4};

    SizeType constexpr maxInputLength{8};
    SizeType constexpr maxNewTokens{2};
    auto constexpr maxSeqLength = maxInputLength + maxNewTokens;
    decoder->setup(samplingConfig, batchSize, maxSeqLength);

    // set up inputs
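    // All-zero logits make the decoder deterministic; the expected tokens are checked further below.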
    auto logits = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, beamWidth, vocabSizePadded}), modelConfig.getDataType()));
    manager.setZero(*logits);
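
    // The end id is not expected to be produced from zero logits, so sequences should only
    // finish once they reach sequenceLimitLength.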
    int constexpr endId{50257};
    std::vector<int> const endIdsVec(batchSize * beamWidth, endId);
    auto endIds
        = std::shared_ptr(manager.copyFrom(endIdsVec, ITensor::makeShape({batchSize, beamWidth}), MemoryType::kGPU));

    DecodingInput inputs{maxInputLength, maxSeqLength, batchSize, logits, endIds};
    std::vector<std::int32_t> sequenceLimitLengthsVec(batchSize, maxSeqLength);
    inputs.sequenceLimitLength
        = manager.copyFrom(sequenceLimitLengthsVec, ITensor::makeShape({batchSize}), MemoryType::kGPU);
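
    // Beam search additionally needs a source cache indirection buffer.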
    if (beamWidth > 1)
    {
        auto srcCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*srcCacheIndirection);
        inputs.cacheIndirection = srcCacheIndirection;
    }

    // set up outputs
    auto outputIds = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
    manager.setZero(*outputIds);
    DecodingOutput outputs{outputIds};
    auto newTokens
        = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32));
    manager.setZero(*newTokens);
    outputs.newTokens = newTokens;

    std::vector<int> sequenceLengthsVec(batchSize * beamWidth, maxInputLength);
    outputs.lengths
        = manager.copyFrom(sequenceLengthsVec, ITensor::makeShape({batchSize, beamWidth}), MemoryType::kGPU);
    outputs.finished = manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kBOOL);
    inputs.finished = ITensor::view(outputs.finished);
    manager.setZero(*outputs.finished);
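    // finishedSum lives in pinned host memory so the host can read the number of finished
    // sequences directly after each forward() call.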
    outputs.finishedSum = BufferManager::pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
    auto* finishedSumHost = bufferCast<std::int32_t>(*outputs.finishedSum);
    *finishedSumHost = -1;
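
    // Outputs used only by beam search: target cache indirection, cumulative log-probs, and parent ids.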
    if (beamWidth > 1)
    {
        auto tgtCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*tgtCacheIndirection);
        outputs.cacheIndirection = tgtCacheIndirection;

        auto cumLogProbs
            = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kFLOAT));
        manager.setZero(*cumLogProbs);
        outputs.cumLogProbs = cumLogProbs;

        auto parentIds = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*parentIds);
        outputs.parentIds = parentIds;
    }

    // run decoder
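    // The first step must not report all sequences as finished.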
    EXPECT_FALSE(decoder->forward(outputs, inputs));
    inputs.step += 1;
    EXPECT_EQ(*finishedSumHost, 0);

    // verify results
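    // With zero logits the generated token is deterministic: 1023 in sampling mode, the beam
    // index in beam search. The input positions of the output buffer must remain zero.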
    auto outputsIdsHost = manager.copyFrom(*outputs.ids, MemoryType::kCPU);
    auto output = bufferCast<std::int32_t>(*outputsIdsHost);
    manager.getStream().synchronize();

    for (auto b = 0; b < batchSize; ++b)
    {
        for (auto bw = 0; bw < beamWidth; ++bw)
        {
            auto const result = (beamWidth == 1) ? 1023 : bw;

            bool anyMismatch = false;
            for (auto i = 0; i < maxInputLength; ++i)
            {
                auto const outputIndex = tc::flat_index3(b, bw, i, beamWidth, maxSeqLength);
                EXPECT_EQ(output[outputIndex], 0) << " b: " << b << " bw: " << bw << " i: " << i;
                anyMismatch |= (output[outputIndex] != 0);
            }
            for (auto i = 0; i < maxNewTokens - 1; ++i)
            {
                auto const index = tc::flat_index3(b, bw, maxInputLength + i, beamWidth, maxSeqLength);
                EXPECT_EQ(output[index], result) << " b: " << b << " bw: " << bw << " i: " << i;
                anyMismatch |= (output[index] != result);
            }
            ASSERT_FALSE(anyMismatch);
        }
    }

    // run decoder again
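    // After the second step every sequence has reached its length limit, so the decoder
    // should report all of them as finished.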
    EXPECT_TRUE(decoder->forward(outputs, inputs));
    EXPECT_EQ(*finishedSumHost, outputs.finished->getSize());
}

} // namespace

class ParamTest : public ::testing::TestWithParam<std::tuple<nvinfer1::DataType, SizeType>>
{
};

TEST_P(ParamTest, Test)
{
    nvinfer1::DataType const dtype{std::get<0>(GetParam())};
    SizeType const beamWidth{std::get<1>(GetParam())};
    SamplingConfig const samplingConfig{beamWidth};

    testDecoder(dtype, samplingConfig);
}
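
// Instantiate over float and half logits and beam widths 1 (sampling) and 3 (beam search);
// the name generator produces readable test names such as FloatSampling or HalfBeamWidth3.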
INSTANTIATE_TEST_SUITE_P(GptDecoderTest, ParamTest,
    testing::Combine(testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kHALF), testing::Values(1, 3)),
    [](const testing::TestParamInfo<ParamTest::ParamType>& info)
    {
        std::string name{std::get<0>(info.param) == nvinfer1::DataType::kFLOAT ? "Float" : "Half"};
        auto const beamWidth = std::get<1>(info.param);
        name.append(beamWidth == 1 ? "Sampling" : "BeamWidth" + std::to_string(beamWidth));
        return name;
    });