chore: Clean up cpp runtime (#4449)

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Authored by Robin Kobus on 2025-05-28 16:32:59 +02:00, committed by GitHub
parent ed3c67e34a
commit 12763779c4
GPG Key ID: B5690EEEBB952194
10 changed files with 47 additions and 67 deletions

View File

@ -119,8 +119,6 @@ public:
void getBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::ModelConfig const& modelConfig) const;
void reshapePositionIds(std::vector<SizeType32> const& positionIdsHost, bool isChatGlm);
void copyPositionIds(runtime::TllmRuntime const& runtime, std::vector<SizeType32> const& positionIdsHost,
bool isChatGlm, TensorPtr const& decoderPositionIds);

View File

@ -118,6 +118,10 @@ public:
//! @returns [batchSize, maxBeamWidth], sequence lengths, on gpu
[[nodiscard]] TensorPtr getSequenceLengths() const;
//! @param batchIdx index of the batch
//! @returns [maxBeamWidth], sequence lengths for request `batchIdx`, on gpu
[[nodiscard]] TensorPtr getSequenceLengths(SizeType32 batchIdx) const;
//! @brief Get maxTokensPerStep tokens generated in the last forward pass
//! @returns [maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
[[nodiscard]] TensorPtr getAllNewTokens() const;
@ -140,6 +144,8 @@ public:
//! @returns [maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
[[nodiscard]] TensorPtr getFinishedSteps() const;
[[nodiscard]] SizeType32 getMaxBatchSize() const;
[[nodiscard]] SizeType32 getMaxBeamWidth() const;
[[nodiscard]] SizeType32 getMaxSequenceLength() const;
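The new accessors let callers ask the decoder state for its configured limits and for per-request views instead of reading them off the joint output tensors. A minimal usage sketch (hypothetical caller, not code from this commit; it only assumes the getters declared above):

// Sketch (illustration only): walk the configured batch slots and take the
// per-request sequence-length view instead of slicing the joint
// [batchSize, maxBeamWidth] tensor by hand.
for (SizeType32 batchIdx = 0; batchIdx < decoderState.getMaxBatchSize(); ++batchIdx)
{
    auto const seqLengths = decoderState.getSequenceLengths(batchIdx); // [maxBeamWidth], on gpu
    // ... hand seqLengths to a per-slot copy or postprocessing step ...
}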

View File

@ -169,13 +169,12 @@ void CreateNewDecoderRequests::newRequest(SizeType32 batchSlot, runtime::decoder
BufferManager manager{std::make_shared<CudaStream>(decoderStream.get())};
auto const& jointOutputIdsShape = decoderState.getJointDecodingOutput().ids->getShape();
auto const batchSize = jointOutputIdsShape.d[0];
auto const batchSize = decoderState.getMaxBatchSize();
TLLM_CHECK(0 <= batchSlot && batchSlot < batchSize);
auto const maxBeamWidth = jointOutputIdsShape.d[1];
auto const maxBeamWidth = decoderState.getMaxBeamWidth();
auto const beamWidth = samplingConfig.beamWidth;
TLLM_CHECK_WITH_INFO(beamWidth <= maxBeamWidth,
tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (" FMT_DIM ") passed to decoder setup function.",
tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.",
beamWidth, maxBeamWidth));
auto const& requestIds = request.ids;
auto const inputLength = request.inputLen;
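Two things change in this hunk: batchSize and maxBeamWidth now come from the decoder state's getters instead of the joint output ids shape, and the error message switches from FMT_DIM to %d because the getter returns a 32-bit SizeType32, whereas a shape dimension is a 64-bit value in current TensorRT (which is what the FMT_DIM helper prints). A self-contained sketch of that format-specifier point, using stand-in variables rather than the real shape and state types:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main()
{
    // Stand-ins, not the TRT-LLM types: a shape dimension is 64-bit,
    // the new getter returns a 32-bit SizeType32.
    std::int64_t const maxBeamWidthFromShape = 4;  // like jointOutputIdsShape.d[1]
    std::int32_t const maxBeamWidthFromGetter = 4; // like decoderState.getMaxBeamWidth()

    // The 64-bit dim needs a 64-bit specifier (what FMT_DIM provided);
    // the 32-bit getter result matches a plain %d.
    std::printf("from shape dim: %" PRId64 "\n", maxBeamWidthFromShape);
    std::printf("from getter:    %d\n", maxBeamWidthFromGetter);
    return 0;
}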

View File

@ -313,18 +313,6 @@ void TransformerBuffers::getBuffers(
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void TransformerBuffers::reshapePositionIds(std::vector<SizeType32> const& positionIdsHost, bool isChatGlm)
{
if (isChatGlm)
{
positionIds->reshape(ITensor::makeShape({2, static_cast<int>(positionIdsHost.size()) / 2}));
}
else
{
positionIds->reshape(ITensor::makeShape({static_cast<int>(positionIdsHost.size())}));
}
}
void TransformerBuffers::copyPositionIds(runtime::TllmRuntime const& runtime,
std::vector<SizeType32> const& positionIdsHost, bool isChatGlm, TensorPtr const& decoderPositionIds)
{
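The removed helper only chose a shape for the position-ids buffer: ChatGLM uses a two-dimensional position encoding, so the flat host vector is viewed as {2, size/2}, while every other model keeps the flat {size} view. A self-contained toy that reproduces just that shape computation (a small Shape struct stands in for ITensor::makeShape; it is not the real API):

#include <cstdio>
#include <vector>

// Toy stand-in for an ITensor shape: up to two dimensions.
struct Shape
{
    int nbDims;
    int d[2];
};

// Mirrors the removed TransformerBuffers::reshapePositionIds logic:
// ChatGLM packs two position streams into one host vector, so the buffer is
// viewed as {2, size/2}; other models keep the flat {size} view.
Shape positionIdsShape(std::vector<int> const& positionIdsHost, bool isChatGlm)
{
    if (isChatGlm)
    {
        return Shape{2, {2, static_cast<int>(positionIdsHost.size()) / 2}};
    }
    return Shape{1, {static_cast<int>(positionIdsHost.size()), 0}};
}

int main()
{
    std::vector<int> const positionIdsHost(12, 0);
    auto const chatGlmShape = positionIdsShape(positionIdsHost, true);
    auto const defaultShape = positionIdsShape(positionIdsHost, false);
    std::printf("chatglm: {%d, %d}, default: {%d}\n", chatGlmShape.d[0], chatGlmShape.d[1], defaultShape.d[0]);
    return 0;
}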

View File

@ -1906,13 +1906,12 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs(
// Make sure that postprocessing is done before copying outputIds
mCopyBufferManager.getStream().wait(event.get());
TensorPtr sequenceLengthView
= ITensor::slice(mDecoder->getDecoderState().getJointDecodingOutput().lengths, seqSlot, 1);
auto sequenceLengths = mDecoder->getDecoderState().getSequenceLengths(seqSlot);
auto outputIds = mDecoder->getDecoderState().getGatheredIds(seqSlot);
auto cumLogProbs = mDecoder->getDecoderState().getCumLogProbs(seqSlot);
auto logProbs = mDecoder->getDecoderState().getLogProbs(seqSlot);
mCopyBufferManager.copy(*sequenceLengthView, *mSlotDecoderBuffers[seqSlot]->sequenceLengths);
mCopyBufferManager.copy(*sequenceLengths, *mSlotDecoderBuffers[seqSlot]->sequenceLengths);
mCopyBufferManager.copy(*outputIds, *mSlotDecoderBuffers[seqSlot]->outputIds);
if (returnLogProbs)
{
@ -1927,7 +1926,7 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs(
auto const peerSend = 0;
mDecSlotAsyncSndHdls.emplace_back(std::make_unique<DecoderSlotAsyncSend>(
outputIds, sequenceLengthView, cumLogProbs, logProbs, returnLogProbs, *mMpiCommPipelinePara, peerSend));
outputIds, sequenceLengths, cumLogProbs, logProbs, returnLogProbs, *mMpiCommPipelinePara, peerSend));
}
}
else

View File

@ -40,8 +40,7 @@ runtime::CudaEvent UpdateDecoderBuffers::operator()(runtime::ModelConfig const&
copyBufferManager.getStream().wait(decoderFinishEvent);
copyBufferManager.copy(*decoder.getDecoderState().getAllNewTokens(), *decoderOutputBuffers.newOutputTokensHost);
copyBufferManager.copy(
*decoder.getDecoderState().getJointDecodingOutput().lengths, *decoderOutputBuffers.sequenceLengthsHost);
copyBufferManager.copy(*decoder.getDecoderState().getSequenceLengths(), *decoderOutputBuffers.sequenceLengthsHost);
auto const finishedSumDevice = decoder.getDecoderState().getFinishedSum();
copyBufferManager.copy(*finishedSumDevice, *decoderOutputBuffers.finishedSumHost);
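This hunk and the previous one keep the same synchronization around the new getters: the copy manager's stream first waits on the event recorded by the decoder, then issues the device-to-host copies. A self-contained toy of that ordering using the raw CUDA runtime API (the BufferManager and CudaEvent wrappers are TRT-LLM types and are not reproduced here; error checking is omitted for brevity):

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
    cudaStream_t decoderStream{};
    cudaStream_t copyStream{};
    cudaEvent_t decoderFinished{};
    cudaStreamCreate(&decoderStream);
    cudaStreamCreate(&copyStream);
    cudaEventCreate(&decoderFinished);

    int* deviceLengths{};
    std::vector<int> hostLengths(8, 0);
    cudaMalloc(&deviceLengths, hostLengths.size() * sizeof(int));

    // ... decoder kernels would be enqueued on decoderStream here ...
    cudaEventRecord(decoderFinished, decoderStream);

    // Equivalent of copyBufferManager.getStream().wait(decoderFinishEvent):
    // copies on copyStream cannot start before the decoder event completes.
    cudaStreamWaitEvent(copyStream, decoderFinished, 0);
    // Equivalent of copyBufferManager.copy(deviceTensor, hostTensor).
    cudaMemcpyAsync(hostLengths.data(), deviceLengths, hostLengths.size() * sizeof(int),
        cudaMemcpyDeviceToHost, copyStream);

    cudaStreamSynchronize(copyStream);
    std::printf("copied %zu sequence lengths to host\n", hostLengths.size());

    cudaFree(deviceLengths);
    cudaEventDestroy(decoderFinished);
    cudaStreamDestroy(copyStream);
    cudaStreamDestroy(decoderStream);
    return 0;
}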

View File

@ -417,75 +417,57 @@ void DecoderState::disableLookahead(RequestVector const& genRequests)
TensorPtr DecoderState::getFinishedSum() const
{
return ITensor::slice(mJointDecodingOutput->finishedSum, 0, mMaxBatchSize);
return mJointDecodingOutput->finishedSum;
}
TensorPtr DecoderState::getFinishReasons() const
{
return ITensor::slice(mJointDecodingOutput->finishReasons, 0, mMaxBatchSize);
return mJointDecodingOutput->finishReasons;
}
TensorPtr DecoderState::getIds() const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto tensor = ITensor::slice(mJointDecodingOutput->ids, 0, mMaxBatchSize);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
return tensor;
return mJointDecodingOutput->ids;
}
TensorPtr DecoderState::getIds(SizeType32 batchIdx) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto tensor = ITensor::slice(mJointDecodingOutput->ids, batchIdx, 1);
tensor->squeeze(0);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
return tensor;
return ITensor::at(mJointDecodingOutput->ids, {batchIdx});
}
TensorPtr DecoderState::getGatheredIds() const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, 0, mMaxBatchSize);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
return tensor;
return mJointDecodingOutput->gatheredIds;
}
TensorPtr DecoderState::getGatheredIds(SizeType32 batchIdx) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, batchIdx, 1);
tensor->squeeze(0);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
return tensor;
return ITensor::at(mJointDecodingOutput->gatheredIds, {batchIdx});
}
TensorPtr DecoderState::getParentIds() const
{
return ITensor::slice(mJointDecodingOutput->parentIds, 0, mMaxBatchSize);
return mJointDecodingOutput->parentIds;
}
TensorPtr DecoderState::getCumLogProbs() const
{
return ITensor::slice(mJointDecodingOutput->cumLogProbs, 0, mMaxBatchSize);
return mJointDecodingOutput->cumLogProbs;
}
TensorPtr DecoderState::getCumLogProbs(SizeType32 batchIdx) const
{
auto tensor = ITensor::slice(mJointDecodingOutput->cumLogProbs, batchIdx, 1);
tensor->squeeze(0);
return tensor;
return ITensor::at(mJointDecodingOutput->cumLogProbs, {batchIdx});
}
TensorPtr DecoderState::getLogProbs() const
{
return ITensor::slice(mJointDecodingOutput->logProbs, 0, mMaxBatchSize);
return mJointDecodingOutput->logProbs;
}
TensorPtr DecoderState::getLogProbs(SizeType32 batchIdx) const
{
auto tensor = ITensor::slice(mJointDecodingOutput->logProbs, batchIdx, 1);
tensor->squeeze(0);
return tensor;
return ITensor::at(mJointDecodingOutput->logProbs, {batchIdx});
}
TensorPtr DecoderState::getSequenceLengths() const
@ -493,6 +475,11 @@ TensorPtr DecoderState::getSequenceLengths() const
return mJointDecodingOutput->lengths;
}
TensorPtr DecoderState::getSequenceLengths(SizeType32 batchIdx) const
{
return ITensor::at(mJointDecodingOutput->lengths, {batchIdx});
}
TensorPtr DecoderState::getAllNewTokens() const
{
return mJointDecodingOutput->newTokensSteps;
@ -528,6 +515,11 @@ TensorPtr DecoderState::getFinishedSteps() const
return mFinishedSteps;
}
SizeType32 DecoderState::getMaxBatchSize() const
{
return mMaxBatchSize;
}
SizeType32 DecoderState::getMaxBeamWidth() const
{
return mMaxBeamWidth;
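The per-request getters all collapse the same three-line pattern (slice one batch entry, squeeze the leading dimension, return) into a single ITensor::at(tensor, {batchIdx}) call, and the full-tensor getters stop slicing to mMaxBatchSize, presumably because the joint tensors already have that leading dimension (an assumption; the allocation code is not part of this diff). A self-contained toy showing that slicing one index and dropping the leading dimension selects the same elements as indexing directly; Tensor2D is a stand-in, not the ITensor API:

#include <cassert>
#include <cstdio>
#include <numeric>
#include <vector>

// Toy row-major [batchSize, beamWidth] tensor; a stand-in for the joint
// decoding output tensors, not the real ITensor type.
struct Tensor2D
{
    int batchSize;
    int beamWidth;
    std::vector<int> data; // size == batchSize * beamWidth
};

// Old pattern: slice(tensor, batchIdx, 1) yields a [1, beamWidth] view,
// squeeze(0) drops the leading dimension to get [beamWidth].
std::vector<int> sliceThenSqueeze(Tensor2D const& t, int batchIdx)
{
    std::vector<std::vector<int>> slice(1); // the [1, beamWidth] view
    slice[0].assign(
        t.data.begin() + batchIdx * t.beamWidth, t.data.begin() + (batchIdx + 1) * t.beamWidth);
    return slice[0]; // squeeze away the leading 1
}

// New pattern: at(tensor, {batchIdx}) indexes the leading dimension directly.
std::vector<int> atBatchIdx(Tensor2D const& t, int batchIdx)
{
    return {t.data.begin() + batchIdx * t.beamWidth, t.data.begin() + (batchIdx + 1) * t.beamWidth};
}

int main()
{
    Tensor2D t{3, 4, std::vector<int>(12)};
    std::iota(t.data.begin(), t.data.end(), 0);

    for (int batchIdx = 0; batchIdx < t.batchSize; ++batchIdx)
    {
        assert(sliceThenSqueeze(t, batchIdx) == atBatchIdx(t, batchIdx));
    }
    std::printf("slice+squeeze and at({batchIdx}) select the same elements\n");
    return 0;
}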

View File

@ -208,8 +208,7 @@ void GptDecoderBatched::prepareForward(
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto const& jointOutputIdsShape = mDecoderState->getJointDecodingOutput().ids->getShape();
auto const maxBeamWidth = jointOutputIdsShape.d[1];
auto const maxBeamWidth = mDecoderState->getMaxBeamWidth();
auto const speculativeDecodingMode = mDecoderState->getSpeculativeDecodingMode();
auto& dInput = mDecoderState->getJointDecodingInput();

View File

@ -48,7 +48,7 @@ void scatterTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth,
void tileTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, CudaStream const& stream);
void mergeLogitsFragments(BufferManager const& bufferManager, ITensor& output,
std::vector<TensorPtr> const& inputVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
std::vector<TensorPtr> const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
SizeType32 firstBatchSlotIdx, SizeType32 microBatchSize, SizeType32 beamWidth, CudaStream const& stream,
int stepOffset);

View File

@ -325,7 +325,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
samplingConfigs, generatedTokensPerSteps, computeLogProbs, manager);
auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
*decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
*decoder.getDecoderState().getSequenceLengths(), manager);
std::vector<decoder_batch::Request> decoderRequests;
newRequests(
@ -333,7 +333,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
cudaDeviceSynchronize();
auto expectedLengths = tiledInputLengths;
checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
auto const& finished = getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager);
EXPECT_EQ(finished.size(), batchSize);
@ -352,7 +352,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots, decoderInputs.srcCacheIndirection);
decoder.forward(outputs, *inputs);
checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
EXPECT_THAT(
getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
@ -363,7 +363,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
advanceSequenceLengths(expectedLengths, acceptedTokensPerStep, samplingConfigs,
getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), batchSize, maxBeamWidth);
decoder.forward(outputs, *inputs);
checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
EXPECT_THAT(
getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(true));
@ -371,7 +371,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
maxSeqLength, inputTokenId, expectedTokenId, endId);
EXPECT_NO_THROW(decoder.forward(outputs, *inputs));
checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
TensorPtr batchSlotsView = ITensor::slice(inputBuffers.setupBatchSlots, 0, 1);
std::vector<SamplingConfig> singleConfig = {samplingConfigs[0]};
@ -454,7 +454,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
samplingConfigs, generatedTokensPerSteps, computeLogProbs, manager);
auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
*decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
*decoder.getDecoderState().getSequenceLengths(), manager);
std::vector<SizeType32> const expectedSteps(batchSize, 0);
auto expectedLengths = tiledInputLengths;
@ -480,7 +480,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
advanceSequenceLengths(
expectedLengths, acceptedTokensPerStep, samplingConfigs, expectedFinished, batchIdx + 1, maxBeamWidth);
checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
for (auto bi = 0; bi <= batchIdx; ++bi)
{
@ -504,7 +504,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
advanceSequenceLengths(
expectedLengths, acceptedTokensPerStep, samplingConfigs, expectedFinished, batchSize, maxBeamWidth);
checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
for (auto bi = 0; bi < batchSize; ++bi)
{
@ -599,7 +599,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
samplingConfigs, generatedTokensPerSteps, false, manager);
auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
*decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
*decoder.getDecoderState().getSequenceLengths(), manager);
auto batchSlotsRange = BufferRange<SizeType32>(*inputBuffers.setupBatchSlots);
std::iota(batchSlotsRange.begin(), batchSlotsRange.end(), 0);
@ -609,7 +609,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
cudaDeviceSynchronize();
auto expectedLengths = tiledInputLengths;
checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
auto const& finished = getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager);
EXPECT_EQ(finished.size(), batchSize);
@ -627,7 +627,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
auto inputs = tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(activeSlots, decoder.getDecoderState(),
decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots, decoderInputs.srcCacheIndirection);
decoder.forward(outputs, *inputs);
checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
EXPECT_THAT(
getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
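All of the test updates are the same mechanical substitution: the expected lengths are still compared against the decoder's sequence-length tensor, now obtained through getSequenceLengths() instead of the joint decoding output. For orientation, a toy of what such a check amounts to once the device tensor has been copied to the host (gtest only; the real checkSequenceLengths helper takes an ITensor and a BufferManager, which are omitted here):

#include <gtest/gtest.h>

#include <cstddef>
#include <vector>

// Toy stand-in for the checkSequenceLengths helper used in the tests above:
// element-wise comparison of a host copy of the [batchSize, maxBeamWidth]
// sequence-length tensor against the expected lengths.
void checkSequenceLengthsHost(std::vector<int> const& actual, std::vector<int> const& expected)
{
    ASSERT_EQ(actual.size(), expected.size());
    for (std::size_t i = 0; i < actual.size(); ++i)
    {
        EXPECT_EQ(actual[i], expected[i]) << "sequence length mismatch at flat index " << i;
    }
}

TEST(SequenceLengthSketch, MatchesExpected)
{
    std::vector<int> const expected{5, 5, 7, 7};
    std::vector<int> const actual{5, 5, 7, 7};
    checkSequenceLengthsHost(actual, expected);
}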