/*
 * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/gptDecoderBatch.h"
#include "tensorrt_llm/runtime/gptModelConfig.h"
#include "tensorrt_llm/runtime/runtimeKernels.h"
#include "tensorrt_llm/runtime/worldConfig.h"

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;

namespace
{

void verifyResults(BufferManager& manager, GptDecoderBatch const& decoder,
    std::vector<SamplingConfig> const& samplingConfigs, std::vector<SizeType> const& inputLengths, SizeType batchSize,
    SizeType maxBeamWidth, SizeType maxSeqLength, SizeType nbNewTokens, int tokenId, int padId)
{
    auto sequenceLengths = decoder.getOutputLengths();
    ASSERT_TRUE(sequenceLengths);
    EXPECT_EQ(sequenceLengths->getSize(), batchSize * maxBeamWidth);
    auto sequenceLengthsHost = manager.copyFrom(*sequenceLengths, MemoryType::kCPU);
    auto sequenceLengthsPtr = bufferCast<SizeType>(*sequenceLengthsHost);
    manager.getStream().synchronize();

    for (auto b = 0; b < batchSize; ++b)
    {
        auto samplingConfig = samplingConfigs[b];
        for (auto bw = 0; bw < samplingConfig.beamWidth; ++bw)
        {
            auto index = tc::flat_index(sequenceLengths->getShape().d, b, bw);
            EXPECT_EQ(sequenceLengthsPtr[index], inputLengths[b] + nbNewTokens);
        }
    }

    auto outputsIds = decoder.getOutputIds();
    // TODO: test parentIds
    // parentIds = decoder.getParentIds();
    ASSERT_TRUE(outputsIds);
    auto outputShape = outputsIds->getShape();
    EXPECT_EQ(outputShape.nbDims, 3);
    EXPECT_EQ(outputShape.d[0], batchSize);
    EXPECT_EQ(outputShape.d[1], maxBeamWidth);
    EXPECT_EQ(outputShape.d[2], maxSeqLength);

    auto outputsIdsHost = manager.copyFrom(*outputsIds, MemoryType::kCPU);
    auto output = bufferCast<TokenIdType>(*outputsIdsHost);
    manager.getStream().synchronize();
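    // Expected layout of the output ids per sequence (descriptive summary of the checks below):
    // the first inputLengths[b] positions hold the input token (tokenId), the next nbNewTokens
    // positions hold the expected generated token (1023 when beamWidth == 1, the beam index bw
    // otherwise), and the remaining positions up to maxSeqLength hold padId.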
    for (auto b = 0; b < batchSize; ++b)
    {
        auto samplingConfig = samplingConfigs[b];
        for (auto bw = 0; bw < samplingConfig.beamWidth; ++bw)
        {
            auto const result = (samplingConfig.beamWidth == 1) ? 1023 : bw;

            auto const outputPtr = output + tc::flat_index(outputShape.d, b, bw, 0);
            auto begin = outputPtr;
            auto end = outputPtr + inputLengths[b];
            ASSERT_THAT(std::vector(begin, end), ::testing::Each(tokenId))
                << "input tokens: " << "b:" << b << " bw: " << bw;
            begin = end;
            end = begin + nbNewTokens;
            ASSERT_THAT(std::vector(begin, end), ::testing::Each(result))
                << "new tokens: " << "b:" << b << " bw: " << bw;
            begin = end;
            end = outputPtr + maxSeqLength;
            ASSERT_THAT(std::vector(begin, end), ::testing::Each(padId))
                << "padding: " << "b:" << b << " bw: " << bw;
        }
    }
}

void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig> const& samplingConfigs, int maxBeamWidth)
{
    TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);

    SizeType constexpr tensorParallelism{1};
    SizeType constexpr pipelineParallelism{1};
    SizeType constexpr localRank{0};
    WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank};

    SizeType constexpr vocabSize{51200};
    SizeType constexpr nbLayers{2};
    SizeType constexpr nbHeads{16};
    SizeType constexpr hiddenSize{1024};
    GptModelConfig modelConfig{vocabSize, nbLayers, nbHeads, hiddenSize, dtype};
    modelConfig.useGptAttentionPlugin(false);

    auto streamPtr = std::make_shared<CudaStream>();
    BufferManager manager(streamPtr);

    // create decoder
    int constexpr endId{50257};
    int constexpr padId{50257};
    auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
    auto decoder = GptDecoderBatch(vocabSize, vocabSizePadded, streamPtr);

    // setup decoder
    auto const batchSize = static_cast<SizeType>(samplingConfigs.size());
    SizeType constexpr maxInputLength{8};
    SizeType constexpr maxNewTokens{2};
    auto constexpr maxSeqLength = maxInputLength + maxNewTokens;
    // We set maxKvCacheLength = maxSeqLength, but it can be smaller than maxSeqLength (cyclic kv cache).
    auto const maxKvCacheLength = maxSeqLength;
    decoder.setup(batchSize, maxBeamWidth, maxSeqLength, maxKvCacheLength, modelConfig.getDataType());

    std::vector<SizeType> const inputLengths{4, 5, 6, 7};
    std::vector<SizeType> tiledInputLengths;
    for (int batch_id = 0; batch_id < inputLengths.size(); batch_id++)
    {
        for (int beam_id = 0; beam_id < maxBeamWidth; beam_id++)
        {
            tiledInputLengths.push_back(inputLengths.at(batch_id));
        }
    }

    // set up inputs
    auto logits = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, maxBeamWidth, vocabSizePadded}), modelConfig.getDataType()));
    manager.setZero(*logits);

    decoder_batch::Input inputs{logits};
    if (maxBeamWidth > 1)
    {
        auto srcCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, maxBeamWidth, maxSeqLength}), TRTDataType<SizeType>::value));
        manager.setZero(*srcCacheIndirection);
        inputs.cacheIndirection = srcCacheIndirection;
    }

    // set up outputs
    decoder_batch::Output outputs{};
    if (maxBeamWidth > 1)
    {
        auto tgtCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, maxBeamWidth, maxSeqLength}), TRTDataType<SizeType>::value));
        manager.setZero(*tgtCacheIndirection);
        outputs.cacheIndirection = tgtCacheIndirection;
    }

    auto sequenceLengths
        = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize * maxBeamWidth}), TRTDataType<SizeType>::value));
    manager.copy(tiledInputLengths.data(), *sequenceLengths);
    outputs.sequenceLengths = sequenceLengths;

    auto constexpr tokenId = 1;
    std::vector<ITensor::SharedPtr> inputIds;
    for (auto b = 0; b < batchSize; ++b)
    {
        auto shape = ITensor::makeShape({inputLengths[b]});
        auto input = std::shared_ptr(manager.gpu(shape, TRTDataType<TokenIdType>::value));
        kernels::invokeFill(*input, tokenId, *streamPtr);
        inputIds.emplace_back(input);

        decoder.newRequest(b, decoder_batch::Request{inputIds[b], maxNewTokens, endId, padId}, samplingConfigs[b]);
    }
    cudaDeviceSynchronize();

    auto const& nbSteps = decoder.getNbSteps();
    EXPECT_EQ(nbSteps.size(), batchSize);
    EXPECT_THAT(nbSteps, ::testing::Each(0));

    auto const& finished = decoder.getFinished();
    EXPECT_EQ(finished.size(), batchSize);
    EXPECT_THAT(finished, ::testing::Each(false));

    verifyResults(
        manager, decoder, samplingConfigs, inputLengths, batchSize, maxBeamWidth, maxSeqLength, 0, tokenId, padId);
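    // With maxNewTokens == 2, the first forward() leaves every request unfinished, the second
    // finishes all of them, and a further forward() must be a harmless no-op; afterwards slot 0
    // is reused for a fresh request and its step/finished state must be reset.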
    // run decoder for 1 step
    decoder.forward(outputs, inputs);
    EXPECT_THAT(decoder.getNbSteps(), ::testing::Each(1));
    EXPECT_THAT(decoder.getFinished(), ::testing::Each(false));

    verifyResults(
        manager, decoder, samplingConfigs, inputLengths, batchSize, maxBeamWidth, maxSeqLength, 1, tokenId, padId);

    // run decoder for 1 step
    decoder.forward(outputs, inputs);
    EXPECT_THAT(decoder.getFinished(), ::testing::Each(true));
    EXPECT_THAT(decoder.getNbSteps(), ::testing::Each(maxNewTokens));

    verifyResults(
        manager, decoder, samplingConfigs, inputLengths, batchSize, maxBeamWidth, maxSeqLength, 2, tokenId, padId);

    EXPECT_NO_THROW(decoder.forward(outputs, inputs));
    EXPECT_THAT(decoder.getNbSteps(), ::testing::Each(maxNewTokens));

    decoder.newRequest(0, decoder_batch::Request{inputIds[0], maxNewTokens}, samplingConfigs[0]);
    EXPECT_FALSE(decoder.getFinished()[0]);
    EXPECT_EQ(decoder.getNbSteps()[0], 0);
}
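// testDecoderWavefront differs from testDecoder in that requests are added one at a time and the
// decoder is stepped immediately after each newRequest(), so the requests advance and finish in a
// staggered ("wavefront") pattern rather than in lockstep.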
void testDecoderWavefront(
    nvinfer1::DataType const dtype, std::vector<SamplingConfig> const& samplingConfigs, int maxBeamWidth)
{
    TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);

    SizeType constexpr tensorParallelism{1};
    SizeType constexpr pipelineParallelism{1};
    SizeType constexpr localRank{0};
    WorldConfig constexpr worldConfig{tensorParallelism, pipelineParallelism, localRank};

    SizeType constexpr vocabSize{51200};
    SizeType constexpr nbLayers{2};
    SizeType constexpr nbHeads{16};
    SizeType constexpr hiddenSize{1024};
    GptModelConfig modelConfig{vocabSize, nbLayers, nbHeads, hiddenSize, dtype};
    modelConfig.useGptAttentionPlugin(false);

    auto streamPtr = std::make_shared<CudaStream>();
    BufferManager manager(streamPtr);

    // create decoder
    int constexpr endId{50257};
    int constexpr padId{50257};
    auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
    auto decoder = GptDecoderBatch(vocabSize, vocabSizePadded, streamPtr);

    // setup decoder
    auto const batchSize = static_cast<SizeType>(samplingConfigs.size());
    SizeType constexpr maxInputLength{8};
    SizeType constexpr maxNewTokens{8};
    auto constexpr maxSeqLength = maxInputLength + maxNewTokens;
    // We set maxKvCacheLength = maxSeqLength, but it can be smaller than maxSeqLength (cyclic kv cache).
    auto const maxKvCacheLength = maxSeqLength;
    decoder.setup(batchSize, maxBeamWidth, maxSeqLength, maxKvCacheLength, modelConfig.getDataType());

    std::vector<SizeType> const inputLengths{4, 5, 6, 7};
    std::vector<SizeType> tiledInputLengths;
    for (int batch_id = 0; batch_id < inputLengths.size(); batch_id++)
    {
        for (int beam_id = 0; beam_id < maxBeamWidth; beam_id++)
        {
            tiledInputLengths.push_back(inputLengths.at(batch_id));
        }
    }

    // set up inputs
    auto logits = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, maxBeamWidth, vocabSizePadded}), modelConfig.getDataType()));
    manager.setZero(*logits);

    decoder_batch::Input inputs{logits};
    if (maxBeamWidth > 1)
    {
        auto srcCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, maxBeamWidth, maxSeqLength}), TRTDataType<SizeType>::value));
        manager.setZero(*srcCacheIndirection);
        inputs.cacheIndirection = srcCacheIndirection;
    }

    // set up outputs
    decoder_batch::Output outputs{};
    if (maxBeamWidth > 1)
    {
        auto tgtCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, maxBeamWidth, maxSeqLength}), TRTDataType<SizeType>::value));
        manager.setZero(*tgtCacheIndirection);
        outputs.cacheIndirection = tgtCacheIndirection;
    }

    auto sequenceLengths
        = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize * maxBeamWidth}), TRTDataType<SizeType>::value));
    manager.copy(tiledInputLengths.data(), *sequenceLengths);
    outputs.sequenceLengths = sequenceLengths;

    auto const& nbSteps = decoder.getNbSteps();
    EXPECT_EQ(nbSteps.size(), batchSize);
    std::vector<SizeType> expectedSteps(batchSize, 0);

    auto const& finished = decoder.getFinished();
    EXPECT_EQ(finished.size(), batchSize);
    std::vector<bool> expectedFinished(batchSize, true);

    auto constexpr tokenId = 1;
    std::vector<ITensor::SharedPtr> inputIds;
    for (auto b = 0; b < batchSize; ++b)
    {
        auto shape = ITensor::makeShape({inputLengths[b]});
        auto input = std::shared_ptr(manager.gpu(shape, TRTDataType<TokenIdType>::value));
        kernels::invokeFill(*input, tokenId, *streamPtr);
        inputIds.emplace_back(input);

        decoder.newRequest(b, decoder_batch::Request{inputIds[b], maxNewTokens, endId, padId}, samplingConfigs[b]);

        decoder.forward(outputs, inputs);

        for (auto i = 0; i < inputIds.size(); ++i)
        {
            expectedSteps[i] = std::min(expectedSteps[i] + 1, maxNewTokens);
            expectedFinished[i] = expectedSteps[i] == maxNewTokens;
        }

        EXPECT_THAT(decoder.getNbSteps(), ::testing::ElementsAreArray(expectedSteps));
        EXPECT_THAT(decoder.getFinished(), ::testing::ElementsAreArray(expectedFinished));
    }

    while (!decoder.getFinished().back())
    {
        decoder.forward(outputs, inputs);
    }

    EXPECT_THAT(decoder.getFinished(), ::testing::Each(true));
    EXPECT_THAT(decoder.getNbSteps(), ::testing::Each(maxNewTokens));

    verifyResults(manager, decoder, samplingConfigs, inputLengths, batchSize, maxBeamWidth, maxSeqLength, maxNewTokens,
        tokenId, padId);
}

} // namespace
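// BeamConfig pairs the beam width the decoder is set up with (maxBeamWidth) with the per-request
// beam widths used to build the SamplingConfigs, so a test can mix requests whose beam width is
// smaller than the decoder's maximum.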
struct BeamConfig
{
    SizeType maxBeamWidth;
    std::vector<SizeType> beamWidths;
};

class ParamTest : public ::testing::TestWithParam<std::tuple<nvinfer1::DataType, BeamConfig>>
{
};

TEST_P(ParamTest, Test)
{
    nvinfer1::DataType const dtype{std::get<0>(GetParam())};
    BeamConfig const beamConfig{std::get<1>(GetParam())};
    std::vector<SamplingConfig> samplingConfigs;
    for (auto const beamWidth : beamConfig.beamWidths)
    {
        samplingConfigs.emplace_back(beamWidth);
    }

    testDecoder(dtype, samplingConfigs, beamConfig.maxBeamWidth);
}

INSTANTIATE_TEST_SUITE_P(GptDecoderTest, ParamTest,
    testing::Combine(testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kHALF),
        testing::Values(BeamConfig{1, {1, 1, 1}}, BeamConfig{3, {3, 3, 3, 3}}, BeamConfig{4, {1, 1}},
            BeamConfig{4, {3, 3, 3}}, BeamConfig{4, {1, 2, 3, 4}})),
    [](const testing::TestParamInfo<ParamTest::ParamType>& info)
    {
        std::string name{std::get<0>(info.param) == nvinfer1::DataType::kFLOAT ? "Float" : "Half"};
        BeamConfig const beamConfig = std::get<1>(info.param);
        name.append("MaxBeamWidth" + std::to_string(beamConfig.maxBeamWidth));
        for (auto const beamWidth : beamConfig.beamWidths)
        {
            name.append("Bw" + std::to_string(beamWidth));
        }
        return name;
    });

class ParamWavefrontTest : public ::testing::TestWithParam<std::tuple<nvinfer1::DataType, BeamConfig>>
{
};

TEST_P(ParamWavefrontTest, Test)
{
    nvinfer1::DataType const dtype{std::get<0>(GetParam())};
    BeamConfig const beamConfig{std::get<1>(GetParam())};
    std::vector<SamplingConfig> samplingConfigs;
    for (auto const beamWidth : beamConfig.beamWidths)
    {
        samplingConfigs.emplace_back(beamWidth);
    }

    testDecoderWavefront(dtype, samplingConfigs, beamConfig.maxBeamWidth);
}

INSTANTIATE_TEST_SUITE_P(GptDecoderTest, ParamWavefrontTest,
    testing::Combine(testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kHALF),
        testing::Values(BeamConfig{1, {1, 1, 1}}, BeamConfig{3, {3, 3, 3, 3}}, BeamConfig{4, {1, 1}},
            BeamConfig{4, {3, 3, 3}}, BeamConfig{4, {1, 2, 3, 4}})),
    [](const testing::TestParamInfo<ParamWavefrontTest::ParamType>& info)
    {
        std::string name{std::get<0>(info.param) == nvinfer1::DataType::kFLOAT ? "Float" : "Half"};
        BeamConfig const beamConfig = std::get<1>(info.param);
        name.append("MaxBeamWidth" + std::to_string(beamConfig.maxBeamWidth));
        for (auto const beamWidth : beamConfig.beamWidths)
        {
            name.append("Bw" + std::to_string(beamWidth));
        }
        return name;
    });
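// Note (illustrative): gtest composes the full test names from the instantiation, suite, and the
// generated parameter name above, e.g. GptDecoderTest/ParamTest.Test/FloatMaxBeamWidth4Bw1Bw2Bw3Bw4.
// A single case can be selected with --gtest_filter; the binary name below is only an assumption:
//   ./gptDecoderBatchTest --gtest_filter="GptDecoderTest/ParamTest.Test/Float*"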