From 12763779c4bb74ce32b8c53916a0363e514fcbaf Mon Sep 17 00:00:00 2001 From: Robin Kobus <19427718+Funatiq@users.noreply.github.com> Date: Wed, 28 May 2025 16:32:59 +0200 Subject: [PATCH] chore: Clean up cpp runtime (#4449) Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com> --- .../batch_manager/transformerBuffers.h | 2 - .../tensorrt_llm/runtime/decoderState.h | 6 +++ .../createNewDecoderRequests.cpp | 7 ++- .../batch_manager/transformerBuffers.cpp | 12 ----- .../trtGptModelInflightBatching.cpp | 7 ++- .../batch_manager/updateDecoderBuffers.cpp | 3 +- cpp/tensorrt_llm/runtime/decoderState.cpp | 50 ++++++++----------- .../runtime/gptDecoderBatched.cpp | 3 +- cpp/tensorrt_llm/runtime/runtimeKernels.h | 2 +- cpp/tests/runtime/gptDecoderBatchedTest.cpp | 22 ++++---- 10 files changed, 47 insertions(+), 67 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/transformerBuffers.h b/cpp/include/tensorrt_llm/batch_manager/transformerBuffers.h index 6ce278110c..17f23c8c12 100644 --- a/cpp/include/tensorrt_llm/batch_manager/transformerBuffers.h +++ b/cpp/include/tensorrt_llm/batch_manager/transformerBuffers.h @@ -119,8 +119,6 @@ public: void getBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::ModelConfig const& modelConfig) const; - void reshapePositionIds(std::vector const& positionIdsHost, bool isChatGlm); - void copyPositionIds(runtime::TllmRuntime const& runtime, std::vector const& positionIdsHost, bool isChatGlm, TensorPtr const& decoderPositionIds); diff --git a/cpp/include/tensorrt_llm/runtime/decoderState.h b/cpp/include/tensorrt_llm/runtime/decoderState.h index ca07aba745..934d100329 100644 --- a/cpp/include/tensorrt_llm/runtime/decoderState.h +++ b/cpp/include/tensorrt_llm/runtime/decoderState.h @@ -118,6 +118,10 @@ public: //! @returns [batchSize, maxBeamWidth], sequence lengths, on gpu [[nodiscard]] TensorPtr getSequenceLengths() const; + //! @param batchIdx index of the batch + //! @returns [maxBeamWidth], sequence lengths for request `batchIdx`, on gpu + [[nodiscard]] TensorPtr getSequenceLengths(SizeType32 batchIdx) const; + //! @brief Get maxTokensPerStep tokens generated in the last forward pass //! @returns [maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu [[nodiscard]] TensorPtr getAllNewTokens() const; @@ -140,6 +144,8 @@ public: //! 
@returns [maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu [[nodiscard]] TensorPtr getFinishedSteps() const; + [[nodiscard]] SizeType32 getMaxBatchSize() const; + [[nodiscard]] SizeType32 getMaxBeamWidth() const; [[nodiscard]] SizeType32 getMaxSequenceLength() const; diff --git a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp index b736c45d22..c4a5e8febe 100644 --- a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp +++ b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp @@ -169,13 +169,12 @@ void CreateNewDecoderRequests::newRequest(SizeType32 batchSlot, runtime::decoder BufferManager manager{std::make_shared(decoderStream.get())}; - auto const& jointOutputIdsShape = decoderState.getJointDecodingOutput().ids->getShape(); - auto const batchSize = jointOutputIdsShape.d[0]; + auto const batchSize = decoderState.getMaxBatchSize(); TLLM_CHECK(0 <= batchSize && batchSlot < batchSize); - auto const maxBeamWidth = jointOutputIdsShape.d[1]; + auto const maxBeamWidth = decoderState.getMaxBeamWidth(); auto const beamWidth = samplingConfig.beamWidth; TLLM_CHECK_WITH_INFO(beamWidth <= maxBeamWidth, - tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (" FMT_DIM ") passed to decoder setup function.", + tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.", beamWidth, maxBeamWidth)); auto const& requestIds = request.ids; auto const inputLength = request.inputLen; diff --git a/cpp/tensorrt_llm/batch_manager/transformerBuffers.cpp b/cpp/tensorrt_llm/batch_manager/transformerBuffers.cpp index 42b1a89712..5810344a01 100644 --- a/cpp/tensorrt_llm/batch_manager/transformerBuffers.cpp +++ b/cpp/tensorrt_llm/batch_manager/transformerBuffers.cpp @@ -313,18 +313,6 @@ void TransformerBuffers::getBuffers( TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void TransformerBuffers::reshapePositionIds(std::vector const& positionIdsHost, bool isChatGlm) -{ - if (isChatGlm) - { - positionIds->reshape(ITensor::makeShape({2, static_cast(positionIdsHost.size()) / 2})); - } - else - { - positionIds->reshape(ITensor::makeShape({static_cast(positionIdsHost.size())})); - } -} - void TransformerBuffers::copyPositionIds(runtime::TllmRuntime const& runtime, std::vector const& positionIdsHost, bool isChatGlm, TensorPtr const& decoderPositionIds) { diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp index 0a1b6f03ec..e9dd6325e5 100644 --- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp +++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp @@ -1906,13 +1906,12 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs( // Make sure that postprocessing is done before copying outputIds mCopyBufferManager.getStream().wait(event.get()); - TensorPtr sequenceLengthView - = ITensor::slice(mDecoder->getDecoderState().getJointDecodingOutput().lengths, seqSlot, 1); + auto sequenceLengths = mDecoder->getDecoderState().getSequenceLengths(seqSlot); auto outputIds = mDecoder->getDecoderState().getGatheredIds(seqSlot); auto cumLogProbs = mDecoder->getDecoderState().getCumLogProbs(seqSlot); auto logProbs = mDecoder->getDecoderState().getLogProbs(seqSlot); - mCopyBufferManager.copy(*sequenceLengthView, *mSlotDecoderBuffers[seqSlot]->sequenceLengths); + mCopyBufferManager.copy(*sequenceLengths, 
*mSlotDecoderBuffers[seqSlot]->sequenceLengths); mCopyBufferManager.copy(*outputIds, *mSlotDecoderBuffers[seqSlot]->outputIds); if (returnLogProbs) { @@ -1927,7 +1926,7 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs( auto const peerSend = 0; mDecSlotAsyncSndHdls.emplace_back(std::make_unique( - outputIds, sequenceLengthView, cumLogProbs, logProbs, returnLogProbs, *mMpiCommPipelinePara, peerSend)); + outputIds, sequenceLengths, cumLogProbs, logProbs, returnLogProbs, *mMpiCommPipelinePara, peerSend)); } } else diff --git a/cpp/tensorrt_llm/batch_manager/updateDecoderBuffers.cpp b/cpp/tensorrt_llm/batch_manager/updateDecoderBuffers.cpp index 83ab19ed33..0b94eb4f7e 100644 --- a/cpp/tensorrt_llm/batch_manager/updateDecoderBuffers.cpp +++ b/cpp/tensorrt_llm/batch_manager/updateDecoderBuffers.cpp @@ -40,8 +40,7 @@ runtime::CudaEvent UpdateDecoderBuffers::operator()(runtime::ModelConfig const& copyBufferManager.getStream().wait(decoderFinishEvent); copyBufferManager.copy(*decoder.getDecoderState().getAllNewTokens(), *decoderOutputBuffers.newOutputTokensHost); - copyBufferManager.copy( - *decoder.getDecoderState().getJointDecodingOutput().lengths, *decoderOutputBuffers.sequenceLengthsHost); + copyBufferManager.copy(*decoder.getDecoderState().getSequenceLengths(), *decoderOutputBuffers.sequenceLengthsHost); auto const finishedSumDevice = decoder.getDecoderState().getFinishedSum(); copyBufferManager.copy(*finishedSumDevice, *decoderOutputBuffers.finishedSumHost); diff --git a/cpp/tensorrt_llm/runtime/decoderState.cpp b/cpp/tensorrt_llm/runtime/decoderState.cpp index db9ae904f7..c86b770575 100644 --- a/cpp/tensorrt_llm/runtime/decoderState.cpp +++ b/cpp/tensorrt_llm/runtime/decoderState.cpp @@ -417,75 +417,57 @@ void DecoderState::disableLookahead(RequestVector const& genRequests) TensorPtr DecoderState::getFinishedSum() const { - return ITensor::slice(mJointDecodingOutput->finishedSum, 0, mMaxBatchSize); + return mJointDecodingOutput->finishedSum; } TensorPtr DecoderState::getFinishReasons() const { - return ITensor::slice(mJointDecodingOutput->finishReasons, 0, mMaxBatchSize); + return mJointDecodingOutput->finishReasons; } TensorPtr DecoderState::getIds() const { - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto tensor = ITensor::slice(mJointDecodingOutput->ids, 0, mMaxBatchSize); - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return tensor; + return mJointDecodingOutput->ids; } TensorPtr DecoderState::getIds(SizeType32 batchIdx) const { - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto tensor = ITensor::slice(mJointDecodingOutput->ids, batchIdx, 1); - tensor->squeeze(0); - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return tensor; + return ITensor::at(mJointDecodingOutput->ids, {batchIdx}); } TensorPtr DecoderState::getGatheredIds() const { - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, 0, mMaxBatchSize); - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return tensor; + return mJointDecodingOutput->gatheredIds; } TensorPtr DecoderState::getGatheredIds(SizeType32 batchIdx) const { - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, batchIdx, 1); - tensor->squeeze(0); - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return tensor; + return ITensor::at(mJointDecodingOutput->gatheredIds, {batchIdx}); } TensorPtr DecoderState::getParentIds() const { - return ITensor::slice(mJointDecodingOutput->parentIds, 0, 
mMaxBatchSize); + return mJointDecodingOutput->parentIds; } TensorPtr DecoderState::getCumLogProbs() const { - return ITensor::slice(mJointDecodingOutput->cumLogProbs, 0, mMaxBatchSize); + return mJointDecodingOutput->cumLogProbs; } TensorPtr DecoderState::getCumLogProbs(SizeType32 batchIdx) const { - auto tensor = ITensor::slice(mJointDecodingOutput->cumLogProbs, batchIdx, 1); - tensor->squeeze(0); - return tensor; + return ITensor::at(mJointDecodingOutput->cumLogProbs, {batchIdx}); } TensorPtr DecoderState::getLogProbs() const { - return ITensor::slice(mJointDecodingOutput->logProbs, 0, mMaxBatchSize); + return mJointDecodingOutput->logProbs; } TensorPtr DecoderState::getLogProbs(SizeType32 batchIdx) const { - auto tensor = ITensor::slice(mJointDecodingOutput->logProbs, batchIdx, 1); - tensor->squeeze(0); - return tensor; + return ITensor::at(mJointDecodingOutput->logProbs, {batchIdx}); } TensorPtr DecoderState::getSequenceLengths() const @@ -493,6 +475,11 @@ TensorPtr DecoderState::getSequenceLengths() const return mJointDecodingOutput->lengths; } +TensorPtr DecoderState::getSequenceLengths(SizeType32 batchIdx) const +{ + return ITensor::at(mJointDecodingOutput->lengths, {batchIdx}); +} + TensorPtr DecoderState::getAllNewTokens() const { return mJointDecodingOutput->newTokensSteps; @@ -528,6 +515,11 @@ TensorPtr DecoderState::getFinishedSteps() const return mFinishedSteps; } +SizeType32 DecoderState::getMaxBatchSize() const +{ + return mMaxBatchSize; +} + SizeType32 DecoderState::getMaxBeamWidth() const { return mMaxBeamWidth; diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp index cfb896ac0b..aabf246ab5 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp @@ -208,8 +208,7 @@ void GptDecoderBatched::prepareForward( { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto const& jointOutputIdsShape = mDecoderState->getJointDecodingOutput().ids->getShape(); - auto const maxBeamWidth = jointOutputIdsShape.d[1]; + auto const maxBeamWidth = mDecoderState->getMaxBeamWidth(); auto const speculativeDecodingMode = mDecoderState->getSpeculativeDecodingMode(); auto& dInput = mDecoderState->getJointDecodingInput(); diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.h b/cpp/tensorrt_llm/runtime/runtimeKernels.h index 27e8df379b..f1c34ace66 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.h +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.h @@ -48,7 +48,7 @@ void scatterTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, void tileTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, CudaStream const& stream); void mergeLogitsFragments(BufferManager const& bufferManager, ITensor& output, - std::vector const& inputVector, ITensor& cachePointerDevice, ITensor& cachePointerHost, + std::vector const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost, SizeType32 firstBatchSlotIdx, SizeType32 microBatchSize, SizeType32 beamWidth, CudaStream const& stream, int stepOffset); diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp index 1aa919e95e..fd6093e486 100644 --- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp +++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp @@ -325,7 +325,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType, samplingConfigs, generatedTokensPerSteps, 
computeLogProbs, manager); auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths, - *decoder.getDecoderState().getJointDecodingOutput().lengths, manager); + *decoder.getDecoderState().getSequenceLengths(), manager); std::vector decoderRequests; newRequests( @@ -333,7 +333,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa cudaDeviceSynchronize(); auto expectedLengths = tiledInputLengths; - checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager); + checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager); auto const& finished = getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager); EXPECT_EQ(finished.size(), batchSize); @@ -352,7 +352,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots, decoderInputs.srcCacheIndirection); decoder.forward(outputs, *inputs); - checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager); + checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager); EXPECT_THAT( getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(false)); @@ -363,7 +363,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa advanceSequenceLengths(expectedLengths, acceptedTokensPerStep, samplingConfigs, getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), batchSize, maxBeamWidth); decoder.forward(outputs, *inputs); - checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager); + checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager); EXPECT_THAT( getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(true)); @@ -371,7 +371,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa maxSeqLength, inputTokenId, expectedTokenId, endId); EXPECT_NO_THROW(decoder.forward(outputs, *inputs)); - checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager); + checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager); TensorPtr batchSlotsView = ITensor::slice(inputBuffers.setupBatchSlots, 0, 1); std::vector singleConfig = {samplingConfigs[0]}; @@ -454,7 +454,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector const expectedSteps(batchSize, 0); auto expectedLengths = tiledInputLengths; @@ -480,7 +480,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector(*inputBuffers.setupBatchSlots); std::iota(batchSlotsRange.begin(), batchSlotsRange.end(), 0); @@ -609,7 +609,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector
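Note (illustrative, not part of the patch): the sketch below shows how call sites are expected to use the DecoderState accessors introduced in this change — getMaxBatchSize(), getMaxBeamWidth(), and the per-request getSequenceLengths(batchIdx) overload — instead of reaching into getJointDecodingOutput() and slicing/squeezing tensors by hand. The helper name copySlotOutputs and its parameters are hypothetical; the includes and namespaces are assumed to match the TensorRT-LLM runtime as it appears in this diff.

// Hypothetical helper, sketched against the accessors added in this patch.
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/decoderState.h"

namespace tr = tensorrt_llm::runtime;

void copySlotOutputs(tr::decoder::DecoderState const& decoderState, tr::SizeType32 seqSlot,
    tr::BufferManager const& copyBufferManager, tr::ITensor& sequenceLengthsOut, tr::ITensor& outputIdsOut)
{
    // Bounds come from dedicated getters rather than from the shape of the joint output ids tensor.
    TLLM_CHECK(0 <= seqSlot && seqSlot < decoderState.getMaxBatchSize());

    // Per-request views on GPU: [maxBeamWidth] sequence lengths and the gathered ids for this slot.
    auto const sequenceLengths = decoderState.getSequenceLengths(seqSlot);
    auto const outputIds = decoderState.getGatheredIds(seqSlot);

    copyBufferManager.copy(*sequenceLengths, sequenceLengthsOut);
    copyBufferManager.copy(*outputIds, outputIdsOut);
}

Used this way, the layout of jointDecodingOutput stays an implementation detail of DecoderState, so callers such as trtGptModelInflightBatching.cpp, updateDecoderBuffers.cpp, and the tests no longer depend on its tensor shapes directly.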