Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
chore: Clean up cpp runtime (#4449)
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
This commit is contained in:
parent ed3c67e34a
commit 12763779c4
@@ -119,8 +119,6 @@ public:
     void getBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::ModelConfig const& modelConfig) const;
 
-    void reshapePositionIds(std::vector<SizeType32> const& positionIdsHost, bool isChatGlm);
-
     void copyPositionIds(runtime::TllmRuntime const& runtime, std::vector<SizeType32> const& positionIdsHost,
         bool isChatGlm, TensorPtr const& decoderPositionIds);
 
@@ -118,6 +118,10 @@ public:
     //! @returns [batchSize, maxBeamWidth], sequence lengths, on gpu
     [[nodiscard]] TensorPtr getSequenceLengths() const;
 
+    //! @param batchIdx index of the batch
+    //! @returns [maxBeamWidth], sequence lengths for request `batchIdx`, on gpu
+    [[nodiscard]] TensorPtr getSequenceLengths(SizeType32 batchIdx) const;
+
     //! @brief Get maxTokensPerStep tokens generated in the last forward pass
     //! @returns [maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
     [[nodiscard]] TensorPtr getAllNewTokens() const;
@@ -140,6 +144,8 @@ public:
     //! @returns [maxTokensPerStep, batchSize, beamWidth], finished states of type FinishedState, on gpu
     [[nodiscard]] TensorPtr getFinishedSteps() const;
 
+    [[nodiscard]] SizeType32 getMaxBatchSize() const;
+
     [[nodiscard]] SizeType32 getMaxBeamWidth() const;
 
     [[nodiscard]] SizeType32 getMaxSequenceLength() const;
@@ -169,13 +169,12 @@ void CreateNewDecoderRequests::newRequest(SizeType32 batchSlot, runtime::decoder
 
     BufferManager manager{std::make_shared<CudaStream>(decoderStream.get())};
 
-    auto const& jointOutputIdsShape = decoderState.getJointDecodingOutput().ids->getShape();
-    auto const batchSize = jointOutputIdsShape.d[0];
+    auto const batchSize = decoderState.getMaxBatchSize();
     TLLM_CHECK(0 <= batchSize && batchSlot < batchSize);
-    auto const maxBeamWidth = jointOutputIdsShape.d[1];
+    auto const maxBeamWidth = decoderState.getMaxBeamWidth();
     auto const beamWidth = samplingConfig.beamWidth;
     TLLM_CHECK_WITH_INFO(beamWidth <= maxBeamWidth,
-        tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (" FMT_DIM ") passed to decoder setup function.",
+        tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.",
             beamWidth, maxBeamWidth));
     auto const& requestIds = request.ids;
     auto const inputLength = request.inputLen;
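A hedged reading of the format-string change above: the old maxBeamWidth was read off a tensor shape, whose dimension type is wider than int and therefore printed with FMT_DIM, while DecoderState::getMaxBeamWidth() returns a SizeType32, which plain %d covers. An illustrative fragment of the new pattern, reusing the decoderState and samplingConfig variables from this hunk:

    // Limits now come from DecoderState instead of the joint output tensor's shape.
    auto const batchSize = decoderState.getMaxBatchSize();      // SizeType32
    auto const maxBeamWidth = decoderState.getMaxBeamWidth();   // SizeType32, hence plain %d below
    TLLM_CHECK_WITH_INFO(samplingConfig.beamWidth <= maxBeamWidth,
        tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.",
            samplingConfig.beamWidth, maxBeamWidth));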
@@ -313,18 +313,6 @@ void TransformerBuffers::getBuffers(
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }
-
-void TransformerBuffers::reshapePositionIds(std::vector<SizeType32> const& positionIdsHost, bool isChatGlm)
-{
-    if (isChatGlm)
-    {
-        positionIds->reshape(ITensor::makeShape({2, static_cast<int>(positionIdsHost.size()) / 2}));
-    }
-    else
-    {
-        positionIds->reshape(ITensor::makeShape({static_cast<int>(positionIdsHost.size())}));
-    }
-}
 
 void TransformerBuffers::copyPositionIds(runtime::TllmRuntime const& runtime,
     std::vector<SizeType32> const& positionIdsHost, bool isChatGlm, TensorPtr const& decoderPositionIds)
 {
@@ -1906,13 +1906,12 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs(
         // Make sure that postprocessing is done before copying outputIds
         mCopyBufferManager.getStream().wait(event.get());
 
-        TensorPtr sequenceLengthView
-            = ITensor::slice(mDecoder->getDecoderState().getJointDecodingOutput().lengths, seqSlot, 1);
+        auto sequenceLengths = mDecoder->getDecoderState().getSequenceLengths(seqSlot);
         auto outputIds = mDecoder->getDecoderState().getGatheredIds(seqSlot);
         auto cumLogProbs = mDecoder->getDecoderState().getCumLogProbs(seqSlot);
         auto logProbs = mDecoder->getDecoderState().getLogProbs(seqSlot);
 
-        mCopyBufferManager.copy(*sequenceLengthView, *mSlotDecoderBuffers[seqSlot]->sequenceLengths);
+        mCopyBufferManager.copy(*sequenceLengths, *mSlotDecoderBuffers[seqSlot]->sequenceLengths);
         mCopyBufferManager.copy(*outputIds, *mSlotDecoderBuffers[seqSlot]->outputIds);
         if (returnLogProbs)
         {
@@ -1927,7 +1926,7 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs(
 
             auto const peerSend = 0;
             mDecSlotAsyncSndHdls.emplace_back(std::make_unique<DecoderSlotAsyncSend>(
-                outputIds, sequenceLengthView, cumLogProbs, logProbs, returnLogProbs, *mMpiCommPipelinePara, peerSend));
+                outputIds, sequenceLengths, cumLogProbs, logProbs, returnLogProbs, *mMpiCommPipelinePara, peerSend));
         }
     }
     else
@@ -40,8 +40,7 @@ runtime::CudaEvent UpdateDecoderBuffers::operator()(runtime::ModelConfig const&
     copyBufferManager.getStream().wait(decoderFinishEvent);
 
     copyBufferManager.copy(*decoder.getDecoderState().getAllNewTokens(), *decoderOutputBuffers.newOutputTokensHost);
-    copyBufferManager.copy(
-        *decoder.getDecoderState().getJointDecodingOutput().lengths, *decoderOutputBuffers.sequenceLengthsHost);
+    copyBufferManager.copy(*decoder.getDecoderState().getSequenceLengths(), *decoderOutputBuffers.sequenceLengthsHost);
 
     auto const finishedSumDevice = decoder.getDecoderState().getFinishedSum();
     copyBufferManager.copy(*finishedSumDevice, *decoderOutputBuffers.finishedSumHost);
@@ -417,75 +417,57 @@ void DecoderState::disableLookahead(RequestVector const& genRequests)
 
 TensorPtr DecoderState::getFinishedSum() const
 {
-    return ITensor::slice(mJointDecodingOutput->finishedSum, 0, mMaxBatchSize);
+    return mJointDecodingOutput->finishedSum;
 }
 
 TensorPtr DecoderState::getFinishReasons() const
 {
-    return ITensor::slice(mJointDecodingOutput->finishReasons, 0, mMaxBatchSize);
+    return mJointDecodingOutput->finishReasons;
 }
 
 TensorPtr DecoderState::getIds() const
 {
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto tensor = ITensor::slice(mJointDecodingOutput->ids, 0, mMaxBatchSize);
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return tensor;
+    return mJointDecodingOutput->ids;
 }
 
 TensorPtr DecoderState::getIds(SizeType32 batchIdx) const
 {
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto tensor = ITensor::slice(mJointDecodingOutput->ids, batchIdx, 1);
-    tensor->squeeze(0);
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return tensor;
+    return ITensor::at(mJointDecodingOutput->ids, {batchIdx});
 }
 
 TensorPtr DecoderState::getGatheredIds() const
 {
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, 0, mMaxBatchSize);
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return tensor;
+    return mJointDecodingOutput->gatheredIds;
 }
 
 TensorPtr DecoderState::getGatheredIds(SizeType32 batchIdx) const
 {
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    auto tensor = ITensor::slice(mJointDecodingOutput->gatheredIds, batchIdx, 1);
-    tensor->squeeze(0);
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return tensor;
+    return ITensor::at(mJointDecodingOutput->gatheredIds, {batchIdx});
 }
 
 TensorPtr DecoderState::getParentIds() const
 {
-    return ITensor::slice(mJointDecodingOutput->parentIds, 0, mMaxBatchSize);
+    return mJointDecodingOutput->parentIds;
 }
 
 TensorPtr DecoderState::getCumLogProbs() const
 {
-    return ITensor::slice(mJointDecodingOutput->cumLogProbs, 0, mMaxBatchSize);
+    return mJointDecodingOutput->cumLogProbs;
 }
 
 TensorPtr DecoderState::getCumLogProbs(SizeType32 batchIdx) const
 {
-    auto tensor = ITensor::slice(mJointDecodingOutput->cumLogProbs, batchIdx, 1);
-    tensor->squeeze(0);
-    return tensor;
+    return ITensor::at(mJointDecodingOutput->cumLogProbs, {batchIdx});
 }
 
 TensorPtr DecoderState::getLogProbs() const
 {
-    return ITensor::slice(mJointDecodingOutput->logProbs, 0, mMaxBatchSize);
+    return mJointDecodingOutput->logProbs;
 }
 
 TensorPtr DecoderState::getLogProbs(SizeType32 batchIdx) const
 {
-    auto tensor = ITensor::slice(mJointDecodingOutput->logProbs, batchIdx, 1);
-    tensor->squeeze(0);
-    return tensor;
+    return ITensor::at(mJointDecodingOutput->logProbs, {batchIdx});
 }
 
 TensorPtr DecoderState::getSequenceLengths() const
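The recurring change in this hunk swaps the two-step slice-and-squeeze for ITensor::at. A minimal sketch of the equivalence, assuming a TensorPtr named tensor whose leading dimension indexes the batch, like the mJointDecodingOutput members above:

    // old pattern: take a length-1 slice at batchIdx, then drop the leading unit dimension
    auto view = ITensor::slice(tensor, batchIdx, 1); // shape [1, ...]
    view->squeeze(0);                                // shape [...]
    // new pattern: ITensor::at yields the same single-entry, squeezed view in one call
    auto same = ITensor::at(tensor, {batchIdx});     // shape [...]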
@@ -493,6 +475,11 @@ TensorPtr DecoderState::getSequenceLengths() const
     return mJointDecodingOutput->lengths;
 }
 
+TensorPtr DecoderState::getSequenceLengths(SizeType32 batchIdx) const
+{
+    return ITensor::at(mJointDecodingOutput->lengths, {batchIdx});
+}
+
 TensorPtr DecoderState::getAllNewTokens() const
 {
     return mJointDecodingOutput->newTokensSteps;
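For context, a hedged caller-side sketch of the new per-request accessor; decoderState, bufferManager, sequenceLengthsHost, and batchIdx are stand-ins assumed from the surrounding runtime code rather than names defined in this diff:

    // [maxBeamWidth] sequence lengths for request `batchIdx`, resident on the GPU
    auto sequenceLengths = decoderState.getSequenceLengths(batchIdx);
    // stage them into a pre-allocated host tensor, as the inflight-batching path does above
    bufferManager.copy(*sequenceLengths, *sequenceLengthsHost);
    // after synchronizing the copy stream, the values can be read on the host
    auto lengthsRange = BufferRange<SizeType32>(*sequenceLengthsHost);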
@@ -528,6 +515,11 @@ TensorPtr DecoderState::getFinishedSteps() const
     return mFinishedSteps;
 }
 
+SizeType32 DecoderState::getMaxBatchSize() const
+{
+    return mMaxBatchSize;
+}
+
 SizeType32 DecoderState::getMaxBeamWidth() const
 {
     return mMaxBeamWidth;
@@ -208,8 +208,7 @@ void GptDecoderBatched::prepareForward(
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
-    auto const& jointOutputIdsShape = mDecoderState->getJointDecodingOutput().ids->getShape();
-    auto const maxBeamWidth = jointOutputIdsShape.d[1];
+    auto const maxBeamWidth = mDecoderState->getMaxBeamWidth();
     auto const speculativeDecodingMode = mDecoderState->getSpeculativeDecodingMode();
 
     auto& dInput = mDecoderState->getJointDecodingInput();
@@ -48,7 +48,7 @@ void scatterTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth,
 void tileTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, CudaStream const& stream);
 
 void mergeLogitsFragments(BufferManager const& bufferManager, ITensor& output,
-    std::vector<TensorPtr> const& inputVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
+    std::vector<TensorPtr> const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
     SizeType32 firstBatchSlotIdx, SizeType32 microBatchSize, SizeType32 beamWidth, CudaStream const& stream,
     int stepOffset);
 
@@ -325,7 +325,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
     auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
         samplingConfigs, generatedTokensPerSteps, computeLogProbs, manager);
     auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
-        *decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
+        *decoder.getDecoderState().getSequenceLengths(), manager);
 
     std::vector<decoder_batch::Request> decoderRequests;
     newRequests(
@@ -333,7 +333,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
     cudaDeviceSynchronize();
 
     auto expectedLengths = tiledInputLengths;
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
     auto const& finished = getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager);
     EXPECT_EQ(finished.size(), batchSize);
@@ -352,7 +352,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
         decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots, decoderInputs.srcCacheIndirection);
     decoder.forward(outputs, *inputs);
 
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(
         getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
 
@@ -363,7 +363,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
     advanceSequenceLengths(expectedLengths, acceptedTokensPerStep, samplingConfigs,
         getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), batchSize, maxBeamWidth);
     decoder.forward(outputs, *inputs);
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(
         getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(true));
 
@@ -371,7 +371,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
         maxSeqLength, inputTokenId, expectedTokenId, endId);
 
     EXPECT_NO_THROW(decoder.forward(outputs, *inputs));
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
     TensorPtr batchSlotsView = ITensor::slice(inputBuffers.setupBatchSlots, 0, 1);
     std::vector<SamplingConfig> singleConfig = {samplingConfigs[0]};
@@ -454,7 +454,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
     auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
         samplingConfigs, generatedTokensPerSteps, computeLogProbs, manager);
     auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
-        *decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
+        *decoder.getDecoderState().getSequenceLengths(), manager);
 
     std::vector<SizeType32> const expectedSteps(batchSize, 0);
     auto expectedLengths = tiledInputLengths;
@@ -480,7 +480,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
 
         advanceSequenceLengths(
             expectedLengths, acceptedTokensPerStep, samplingConfigs, expectedFinished, batchIdx + 1, maxBeamWidth);
-        checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+        checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
         for (auto bi = 0; bi <= batchIdx; ++bi)
         {
@@ -504,7 +504,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
 
     advanceSequenceLengths(
         expectedLengths, acceptedTokensPerStep, samplingConfigs, expectedFinished, batchSize, maxBeamWidth);
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
     for (auto bi = 0; bi < batchSize; ++bi)
     {
@@ -599,7 +599,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
     auto decoderInputs = createDecoderInputs(batchSize, maxBeamWidth, maxSeqLength, vocabSizePadded, dataType,
         samplingConfigs, generatedTokensPerSteps, false, manager);
     auto outputs = createDecoderOutputs(batchSize, maxBeamWidth, maxSeqLength, tiledInputLengths,
-        *decoder.getDecoderState().getJointDecodingOutput().lengths, manager);
+        *decoder.getDecoderState().getSequenceLengths(), manager);
 
     auto batchSlotsRange = BufferRange<SizeType32>(*inputBuffers.setupBatchSlots);
     std::iota(batchSlotsRange.begin(), batchSlotsRange.end(), 0);
@@ -609,7 +609,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
     cudaDeviceSynchronize();
 
     auto expectedLengths = tiledInputLengths;
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
 
     auto const& finished = getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager);
     EXPECT_EQ(finished.size(), batchSize);
@@ -627,7 +627,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
     auto inputs = tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(activeSlots, decoder.getDecoderState(),
         decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots, decoderInputs.srcCacheIndirection);
     decoder.forward(outputs, *inputs);
-    checkSequenceLengths(*decoder.getDecoderState().getJointDecodingOutput().lengths, expectedLengths, manager);
+    checkSequenceLengths(*decoder.getDecoderState().getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(
         getFinished(*decoder.getDecoderState().getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
 