/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>

#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/gptDecoder.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace tk = tensorrt_llm::kernels;
namespace tle = tensorrt_llm::executor;

namespace
{

// Runs one decoder step asynchronously; if finish tracking is enabled, synchronizes the
// stream and returns true when all sequences in the batch have finished.
bool forwardAndSync(std::unique_ptr<IGptDecoder> const& decoder, DecodingOutput& output, DecodingInput const& input,
    std::shared_ptr<CudaStream> stream)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto const maxBatchSize = input.batchSize;

    BufferManager::ITensorPtr finishedSum;
    std::int32_t* finishedSumHost = nullptr;
    if (input.sequenceLimitLength && output.finished)
    {
        finishedSumHost = bufferCast<std::int32_t>(*output.finishedSum);
        for (SizeType32 bi = 0; bi < maxBatchSize; ++bi)
        {
            finishedSumHost[bi] = 0;
        }
    }

    decoder->forwardAsync(output, input);

    if (finishedSumHost)
    {
        auto const numToFinish = output.finished->getSize();
        TLLM_CUDA_CHECK(::cudaStreamSynchronize(stream->get()));

        SizeType32 finishedSum = 0;
        for (SizeType32 bi = 0; bi < maxBatchSize; ++bi)
        {
            finishedSum += finishedSumHost[bi];
        }
        return numToFinish == static_cast<std::size_t>(finishedSum);
    }
    else
    {
        return false;
    }
}

void testDecoder(nvinfer1::DataType const dtype, SamplingConfig const& samplingConfig)
{
    SizeType32 constexpr tensorParallelism{1};
    SizeType32 constexpr pipelineParallelism{1};
    SizeType32 constexpr localRank{0};
    WorldConfig const worldConfig{tensorParallelism, pipelineParallelism, localRank};

    SizeType32 constexpr vocabSize{51200};
    SizeType32 constexpr nbLayers{2};
    SizeType32 constexpr nbRnnLayers{0};
    SizeType32 constexpr nbHeads{16};
    SizeType32 constexpr hiddenSize{1024};
    SizeType32 constexpr batchSize{4};
    ModelConfig modelConfig{vocabSize, nbLayers, nbRnnLayers, nbHeads, hiddenSize, dtype};
    modelConfig.useGptAttentionPlugin(false);

    SizeType32 constexpr maxInputLength{8};
    SizeType32 constexpr maxNewTokens{2};
    SizeType32 constexpr sinkTokenLength{0};
    auto constexpr maxSeqLength = maxInputLength + maxNewTokens;

    auto streamPtr = std::make_shared<CudaStream>();
    BufferManager manager(streamPtr);

    // setup decoder
    auto const beamWidth = samplingConfig.beamWidth;
    auto const decodingMode
        = beamWidth == 1 ? tle::DecodingMode::TopKTopP() : tle::DecodingMode::BeamSearch();

    // create decoder
    auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
    auto decoder = IGptDecoder::create(decodingMode, modelConfig.getDataType(), batchSize, beamWidth, vocabSize,
        vocabSizePadded, maxSeqLength, streamPtr);
    ASSERT_TRUE(static_cast<bool>(decoder));
    auto batchSlots = getDefaultBatchSlots(batchSize, manager);
    decoder->setup(samplingConfig, batchSize, batchSlots);

    // set up inputs
    auto logits = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, beamWidth, vocabSizePadded}), modelConfig.getDataType()));
    manager.setZero(*logits);

    int constexpr endId{50257};
    std::vector<std::int32_t> const endIdsVec(batchSize * beamWidth, endId);
    auto endIds
        = std::shared_ptr(manager.copyFrom(endIdsVec, ITensor::makeShape({batchSize, beamWidth}), MemoryType::kGPU));

    DecodingInput inputs{maxInputLength, maxSeqLength, sinkTokenLength, batchSize, logits, endIds, batchSlots};
    std::vector<std::int32_t> inputLengthsVec(batchSize * beamWidth, 0);
    inputs.lengths = manager.copyFrom(inputLengthsVec, ITensor::makeShape({batchSize * beamWidth}), MemoryType::kGPU);
    std::vector<std::int32_t> sequenceLimitLengthsVec(batchSize, maxSeqLength);
    inputs.sequenceLimitLength
        = manager.copyFrom(sequenceLimitLengthsVec, ITensor::makeShape({batchSize}), MemoryType::kGPU);

    if (beamWidth > 1)
    {
        auto srcCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*srcCacheIndirection);
        inputs.cacheIndirection = srcCacheIndirection;
    }

    // set up outputs
    auto outputIds = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
    manager.setZero(*outputIds);
    auto gatheredOutputIds = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
    manager.setZero(*gatheredOutputIds);
    DecodingOutput outputs{outputIds, gatheredOutputIds};
    auto newTokens = std::shared_ptr(
        manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32));
    manager.setZero(*newTokens);
    outputs.newTokens = newTokens;

    std::vector<std::int32_t> sequenceLengthsVec(batchSize * beamWidth, maxInputLength);
    outputs.lengths
        = manager.copyFrom(sequenceLengthsVec, ITensor::makeShape({batchSize, beamWidth}), MemoryType::kGPU);
    outputs.finished = manager.gpu(
        ITensor::makeShape({batchSize, beamWidth}), TRTDataType<tk::FinishedState::UnderlyingType>::value);
    inputs.finished = ITensor::view(outputs.finished);
    manager.setZero(*outputs.finished);

    outputs.finishedSum = BufferManager::pinnedPool(ITensor::makeShape({batchSize}), nvinfer1::DataType::kINT32);
    auto finishedSumHost = bufferCast<std::int32_t>(*outputs.finishedSum);
    for (SizeType32 bi = 0; bi < batchSize; ++bi)
    {
        finishedSumHost[bi] = -1;
    }

    if (beamWidth > 1)
    {
        auto tgtCacheIndirection = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*tgtCacheIndirection);
        outputs.cacheIndirection = tgtCacheIndirection;

        auto cumLogProbs = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kFLOAT));
        manager.setZero(*cumLogProbs);
        outputs.cumLogProbs = cumLogProbs;

        auto parentIds = std::shared_ptr(
            manager.gpu(ITensor::makeShape({batchSize, beamWidth, maxSeqLength}), nvinfer1::DataType::kINT32));
        manager.setZero(*parentIds);
        outputs.parentIds = parentIds;
    }

    // run decoder
    EXPECT_FALSE(forwardAndSync(decoder, outputs, inputs, streamPtr));
    inputs.step += 1;

    {
        SizeType32 finishedSum = 0;
        for (SizeType32 bi = 0; bi < batchSize; ++bi)
        {
            finishedSum += finishedSumHost[bi];
        }
        EXPECT_EQ(finishedSum, 0);
    }

    // verify results
    auto outputsIdsHost = manager.copyFrom(*outputs.ids, MemoryType::kCPU);
    auto output = bufferCast<std::int32_t>(*outputsIdsHost);
    manager.getStream().synchronize();

    for (auto b = 0; b < batchSize; ++b)
    {
        for (auto bw = 0; bw < beamWidth; ++bw)
        {
            auto const result = (beamWidth == 1) ? 1023 : bw;

            bool anyMismatch = false;
            for (auto i = 0; i < maxInputLength; ++i)
            {
                auto const outputIndex = tc::flat_index3(b, bw, i, beamWidth, maxSeqLength);
                EXPECT_EQ(output[outputIndex], 0) << " b: " << b << " bw: " << bw << " i: " << i;
                anyMismatch |= (output[outputIndex] != 0);
            }
            for (auto i = 0; i < maxNewTokens - 1; ++i)
            {
                auto const index = tc::flat_index3(b, bw, maxInputLength + i, beamWidth, maxSeqLength);
                EXPECT_EQ(output[index], result) << " b: " << b << " bw: " << bw << " i: " << i;
                anyMismatch |= (output[index] != result);
            }
            ASSERT_FALSE(anyMismatch);
        }
    }

    // run decoder again
    EXPECT_TRUE(forwardAndSync(decoder, outputs, inputs, streamPtr));
    {
        SizeType32 finishedSum = 0;
        for (SizeType32 bi = 0; bi < batchSize; ++bi)
        {
            finishedSum += finishedSumHost[bi];
        }
        EXPECT_EQ(finishedSum, outputs.finished->getSize());
    }
}

} // namespace

class ParamTest : public ::testing::TestWithParam<std::tuple<nvinfer1::DataType, SizeType32>>
{
};

TEST_P(ParamTest, Test)
{
    nvinfer1::DataType const dtype{std::get<0>(GetParam())};
    SizeType32 const beamWidth{std::get<1>(GetParam())};
    SamplingConfig const samplingConfig{beamWidth};

    testDecoder(dtype, samplingConfig);
}

INSTANTIATE_TEST_SUITE_P(DecoderTest, ParamTest,
    testing::Combine(testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kHALF), testing::Values(1, 3)),
    [](testing::TestParamInfo<ParamTest::ParamType> const& info)
    {
        std::string name{std::get<0>(info.param) == nvinfer1::DataType::kFLOAT ? "Float" : "Half"};
        auto const beamWidth = std::get<1>(info.param);
        name.append(beamWidth == 1 ? "Sampling" : "BeamWidth" + std::to_string(beamWidth));
        return name;
    });