Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
chore: Clean up cpp runtime (#4449)
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
This commit is contained in:
parent ed3c67e34a
commit 12763779c4
@@ -119,8 +119,6 @@ public:
     void getBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::ModelConfig const& modelConfig) const;
 
-    void reshapePositionIds(std::vector<SizeType32> const& positionIdsHost, bool isChatGlm);
-
     void copyPositionIds(runtime::TllmRuntime const& runtime, std::vector<SizeType32> const& positionIdsHost,
         bool isChatGlm, TensorPtr const& decoderPositionIds);
 
@@ -118,6 +118,10 @@ public:
     //! @returns [batchSize, maxBeamWidth], sequence lengths, on gpu
     [[nodiscard]] TensorPtr getSequenceLengths() const;
 
+    //! @param batchIdx index of the batch
+    //! @returns [maxBeamWidth], sequence lengths for request `batchIdx`, on gpu
+    [[nodiscard]] TensorPtr getSequenceLengths(SizeType32 batchIdx) const;
+
     //! @brief Get maxTokensPerStep tokens generated in the last forward pass
     //! @returns [maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
     [[nodiscard]] TensorPtr getAllNewTokens() const;
@@ -140,6 +144,8 @@ public:
     //! @returns [maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
     [[nodiscard]] TensorPtr getFinishedSteps() const;
 
+    [[nodiscard]] SizeType32 getMaxBatchSize() const;
+
     [[nodiscard]] SizeType32 getMaxBeamWidth() const;
 
     [[nodiscard]] SizeType32 getMaxSequenceLength() const;
@@ -169,13 +169,12 @@ void CreateNewDecoderRequests::newRequest(SizeType32 batchSlot, runtime::decoder
 
     BufferManager manager{std::make_shared<CudaStream>(decoderStream.get())};
 
-    auto const& jointOutputIdsShape = decoderState.getJointDecodingOutput().ids->getShape();
-    auto const batchSize = jointOutputIdsShape.d[0];
+    auto const batchSize = decoderState.getMaxBatchSize();
     TLLM_CHECK(0 <= batchSize && batchSlot < batchSize);
-    auto const maxBeamWidth = jointOutputIdsShape.d[1];
+    auto const maxBeamWidth = decoderState.getMaxBeamWidth();
     auto const beamWidth = samplingConfig.beamWidth;
     TLLM_CHECK_WITH_INFO(beamWidth <= maxBeamWidth,
-        tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (" FMT_DIM ") passed to decoder setup function.",
+        tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.",
             beamWidth, maxBeamWidth));
     auto const& requestIds = request.ids;
     auto const inputLength = request.inputLen;
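A hedged reading of the format-string change above: the old maxBeamWidth was read off a tensor shape, whose dimension type is wider than int and therefore printed with FMT_DIM, while DecoderState::getMaxBeamWidth() returns a SizeType32, which plain %d covers. An illustrative fragment of the new pattern, reusing the decoderState and samplingConfig variables from this hunk:

    // Limits now come from DecoderState instead of the joint output tensor's shape.
    auto const batchSize = decoderState.getMaxBatchSize();      // SizeType32
    auto const maxBeamWidth = decoderState.getMaxBeamWidth();   // SizeType32, hence plain %d below
    TLLM_CHECK_WITH_INFO(samplingConfig.beamWidth <= maxBeamWidth,
        tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.",
            samplingConfig.beamWidth, maxBeamWidth));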
@@ -313,18 +313,6 @@ void TransformerBuffers::getBuffers(
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }
-
-void TransformerBuffers::reshapePositionIds(std::vector<SizeType32> const& positionIdsHost, bool isChatGlm)
-{
-    if (isChatGlm)
-    {
-        positionIds->reshape(ITensor::makeShape({2, static_cast<int>(positionIdsHost.size()) / 2}));
-    }
-    else
-    {
-        positionIds->reshape(ITensor::makeShape({static_cast<int>(positionIdsHost.size())}));
-    }
-}
 
 void TransformerBuffers::copyPositionIds(runtime::TllmRuntime const& runtime,
     std::vector<SizeType32> const& positionIdsHost, bool isChatGlm, TensorPtr const& decoderPositionIds)
 {
@@ -1906,13 +1906,12 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs(
         // Make sure that postprocessing is done before copying outputIds
         mCopyBufferManager.getStream().wait(event.get());
 
-        TensorPtr sequenceLengthView
-            = ITensor::slice(mDecoder->getDecoderState().getJointDecodingOutput().lengths, seqSlot, 1);
+        auto sequenceLengths = mDecoder->getDecoderState().getSequenceLengths(seqSlot);
         auto outputIds = mDecoder->getDecoderState().getGatheredIds(seqSlot);
         auto cumLogProbs = mDecoder->getDecoderState().getCumLogProbs(seqSlot);
         auto logProbs = mDecoder->getDecoderState().getLogProbs(seqSlot);
 
-        mCopyBufferManager.copy(*sequenceLengthView, *mSlotDecoderBuffers[seqSlot]->sequenceLengths);
+        mCopyBufferManager.copy(*sequenceLengths, *mSlotDecoderBuffers[seqSlot]->sequenceLengths);
         mCopyBufferManager.copy(*outputIds, *mSlotDecoderBuffers[seqSlot]->outputIds);
         if (returnLogProbs)
         {
@@ -1927,7 +1926,7 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs(
 
             auto const peerSend = 0;
             mDecSlotAsyncSndHdls.emplace_back(std::make_unique<DecoderSlotAsyncSend>(
-                outputIds, sequenceLengthView, cumLogProbs, logProbs, returnLogProbs, *mMpiCommPipelinePara, peerSend));
+                outputIds, sequenceLengths, cumLogProbs, logProbs, returnLogProbs, *mMpiCommPipelinePara, peerSend));
         }
     }
     else
@@ -40,8 +40,7 @@ runtime::CudaEvent UpdateDecoderBuffers::operator()(runtime::ModelConfig const&
     copyBufferManager.getStream().wait(decoderFinishEvent);
 
     copyBufferManager.copy(*decoder.getDecoderState().getAllNewTokens(), *decoderOutputBuffers.newOutputTokensHost);
-    copyBufferManager.copy(
-        *decoder.getDecoderState().getJointDecodingOutput().lengths, *decoderOutputBuffers.sequenceLengthsHost);
+    copyBufferManager.copy(*decoder.getDecoderState().getSequenceLengths(), *decoderOutputBuffers.sequenceLengthsHost);
 
     auto const finishedSumDevice = decoder.getDecoderState().getFinishedSum();
     copyBufferManager.copy(*finishedSumDevice, *decoderOutputBuffers.finishedSumHost);
@@ -417,75 +417,57 @@ void DecoderState::disableLookahead(RequestVector const& genRequests)
 
 TensorPtr DecoderState::getFinishedSum() const
 {
-    return ITensor::slice(mJointDecodingOutput->finishedSum, 0, mMaxBatchSize);
+    return mJointDecodingOutput->finishedSum;
 }
 
 TensorPtr DecoderState::getFinishReasons() const
 {
-    return ITensor::slice(mJointDecodingOutput->finishReasons, 0, mMaxBatchSize);
+    return mJointDecodingOutput->finishReasons;
 }
 
 TensorPtr DecoderState::getIds() const
 {
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto tensor = ITensor::slice(mJointDecodingOutput->ids, 0, mMaxBatchSize);
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return tensor;
+    return mJointDecodingOutput->ids;
 }
 
 TensorPtr DecoderState::getIds(SizeType32 batchIdx) const
 {
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto tensor = ITensor::slice(mJointDecodingOutput->ids, batchIdx, 1);
-    tensor->squeeze(0);
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return tensor;
+    return ITensor::at(mJointDecodingOutput->ids, {batchIdx});
 }
 
 TensorPtr DecoderState::getGatheredIds() const
 {
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, 0, mMaxBatchSize);
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return tensor;
+    return mJointDecodingOutput->gatheredIds;
 }
 
 TensorPtr DecoderState::getGatheredIds(SizeType32 batchIdx) const
 {
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, batchIdx, 1);
-    tensor->squeeze(0);
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return tensor;
+    return ITensor::at(mJointDecodingOutput->gatheredIds, {batchIdx});
 }
 
 TensorPtr DecoderState::getParentIds() const
 {
-    return ITensor::slice(mJointDecodingOutput->parentIds, 0, mMaxBatchSize);
+    return mJointDecodingOutput->parentIds;
 }
 
 TensorPtr DecoderState::getCumLogProbs() const
 {
-    return ITensor::slice(mJointDecodingOutput->cumLogProbs, 0, mMaxBatchSize);
+    return mJointDecodingOutput->cumLogProbs;
 }
 
 TensorPtr DecoderState::getCumLogProbs(SizeType32 batchIdx) const
 {
-    auto tensor = ITensor::slice(mJointDecodingOutput->cumLogProbs, batchIdx, 1);
-    tensor->squeeze(0);
-    return tensor;
+    return ITensor::at(mJointDecodingOutput->cumLogProbs, {batchIdx});
 }
 
 TensorPtr DecoderState::getLogProbs() const
 {
-    return ITensor::slice(mJointDecodingOutput->logProbs, 0, mMaxBatchSize);
+    return mJointDecodingOutput->logProbs;
 }
 
 TensorPtr DecoderState::getLogProbs(SizeType32 batchIdx) const
 {
-    auto tensor = ITensor::slice(mJointDecodingOutput->logProbs, batchIdx, 1);
-    tensor->squeeze(0);
-    return tensor;
+    return ITensor::at(mJointDecodingOutput->logProbs, {batchIdx});
 }
 
 TensorPtr DecoderState::getSequenceLengths() const
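The recurring change in this hunk swaps the two-step slice-and-squeeze for ITensor::at. A minimal sketch of the equivalence, assuming a TensorPtr named tensor whose leading dimension indexes the batch, like the mJointDecodingOutput members above:

    // old pattern: take a length-1 slice at batchIdx, then drop the leading unit dimension
    auto view = ITensor::slice(tensor, batchIdx, 1); // shape [1, ...]
    view->squeeze(0);                                // shape [...]
    // new pattern: ITensor::at yields the same single-entry, squeezed view in one call
    auto same = ITensor::at(tensor, {batchIdx});     // shape [...]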
@@ -493,6 +475,11 @@ TensorPtr DecoderState::getSequenceLengths() const
     return mJointDecodingOutput->lengths;
 }
 
+TensorPtr DecoderState::getSequenceLengths(SizeType32 batchIdx) const
+{
+    return ITensor::at(mJointDecodingOutput->lengths, {batchIdx});
+}
+
 TensorPtr DecoderState::getAllNewTokens() const
 {
     return mJointDecodingOutput->newTokensSteps;
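For context, a hedged caller-side sketch of the new per-request accessor; decoderState, bufferManager, sequenceLengthsHost, and batchIdx are stand-ins assumed from the surrounding runtime code rather than names defined in this diff:

    // [maxBeamWidth] sequence lengths for request `batchIdx`, resident on the GPU
    auto sequenceLengths = decoderState.getSequenceLengths(batchIdx);
    // stage them into a pre-allocated host tensor, as the inflight-batching path does above
    bufferManager.copy(*sequenceLengths, *sequenceLengthsHost);
    // after synchronizing the copy stream, the values can be read on the host
    auto lengthsRange = BufferRange<SizeType32>(*sequenceLengthsHost);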
@@ -528,6 +515,11 @@ TensorPtr DecoderState::getFinishedSteps() const
     return mFinishedSteps;
 }
 
+SizeType32 DecoderState::getMaxBatchSize() const
+{
+    return mMaxBatchSize;
+}
+
 SizeType32 DecoderState::getMaxBeamWidth() const
 {
     return mMaxBeamWidth;
@@ -208,8 +208,7 @@ void GptDecoderBatched::prepareForward(
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
-    auto const& jointOutputIdsShape = mDecoderState->getJointDecodingOutput().ids->getShape();
-    auto const maxBeamWidth = jointOutputIdsShape.d[1];
+    auto const maxBeamWidth = mDecoderState->getMaxBeamWidth();
     auto const speculativeDecodingMode = mDecoderState->getSpeculativeDecodingMode();
 
     auto& dInput = mDecoderState->getJointDecodingInput();
@@ -48,7 +48,7 @@ void scatterTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth,
 void tileTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, CudaStream const& stream);
 
 void mergeLogitsFragments(BufferManager const& bufferManager, ITensor& output,
-    std::vector<TensorPtr> const& inputVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
+    std::vector<TensorPtr> const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
     SizeType32 firstBatchSlotIdx, SizeType32 microBatchSize, SizeType32 beamWidth, CudaStream const& stream,
     int stepOffset);
 
@@ -325,7 +325,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
     auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
         samplingConfigs, generatedTokensPerSteps, computeLogProbs, manager);
     auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
-        *decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
+        *decoder.getDecoderState().getSequenceLengths(), manager);
 
     std::vector<decoder_batch::Request> decoderRequests;
     newRequests(
@@ -333,7 +333,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
     cudaDeviceSynchronize();
 
     auto expectedLengths = tiledInputLengths;
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
     auto const& finished = getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager);
     EXPECT_EQ(finished.size(), batchSize);
@@ -352,7 +352,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
         decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots, decoderInputs.srcCacheIndirection);
     decoder.forward(outputs, *inputs);
 
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(
         getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
 
@@ -363,7 +363,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
     advanceSequenceLengths(expectedLengths, acceptedTokensPerStep, samplingConfigs,
         getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), batchSize, maxBeamWidth);
     decoder.forward(outputs, *inputs);
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(
         getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(true));
 
@@ -371,7 +371,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
         maxSeqLength, inputTokenId, expectedTokenId, endId);
 
     EXPECT_NO_THROW(decoder.forward(outputs, *inputs));
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
     TensorPtr batchSlotsView = ITensor::slice(inputBuffers.setupBatchSlots, 0, 1);
     std::vector<SamplingConfig> singleConfig = {samplingConfigs[0]};
@@ -454,7 +454,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
     auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
         samplingConfigs, generatedTokensPerSteps, computeLogProbs, manager);
     auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
-        *decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
+        *decoder.getDecoderState().getSequenceLengths(), manager);
 
     std::vector<SizeType32> const expectedSteps(batchSize, 0);
     auto expectedLengths = tiledInputLengths;
@@ -480,7 +480,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
 
         advanceSequenceLengths(
             expectedLengths, acceptedTokensPerStep, samplingConfigs, expectedFinished, batchIdx + 1, maxBeamWidth);
-        checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+        checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
         for (auto bi = 0; bi <= batchIdx; ++bi)
         {
@@ -504,7 +504,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
 
     advanceSequenceLengths(
         expectedLengths, acceptedTokensPerStep, samplingConfigs, expectedFinished, batchSize, maxBeamWidth);
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
     for (auto bi = 0; bi < batchSize; ++bi)
     {
@@ -599,7 +599,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
     auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
         samplingConfigs, generatedTokensPerSteps, false, manager);
     auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
-        *decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
+        *decoder.getDecoderState().getSequenceLengths(), manager);
 
     auto batchSlotsRange = BufferRange<SizeType32>(*inputBuffers.setupBatchSlots);
     std::iota(batchSlotsRange.begin(), batchSlotsRange.end(), 0);
@@ -609,7 +609,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
     cudaDeviceSynchronize();
 
     auto expectedLengths = tiledInputLengths;
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
     auto const& finished = getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager);
     EXPECT_EQ(finished.size(), batchSize);
@@ -627,7 +627,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
     auto inputs = tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(activeSlots, decoder.getDecoderState(),
         decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots, decoderInputs.srcCacheIndirection);
     decoder.forward(outputs, *inputs);
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(
         getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
 