TensorRT-LLMs/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
Robin Kobus 2ab71f9a80
refactor: decoder buffers (#3307)
* refactor: remove cumLogProbs and logProbs from DecoderBuffers

- Eliminated cumLogProbs and logProbs from DecoderBuffers, streamlining buffer management.
- Updated related code in decoderBuffers.cpp and bindings.cpp to reflect these changes, ensuring that only host pointers are used for log probabilities.

These modifications enhance code clarity and maintainability by reducing redundancy in buffer management.
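
For reference, DecoderBuffers in the header below now carries only the pinned host variants of these buffers (copied from the declarations further down):

    TensorPtr cumLogProbsHost; // [mMaxNumRequests, beamWidth]
    TensorPtr logProbsHost;    // [mMaxNumRequests, beamWidth, maxSeqLen]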

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

* refactor: streamline sequence length handling in GptDecoderBatched and StatefulGptDecoderBatched

- Updated GptDecoderBatched to directly use output.sequenceLengths for lengths assignment, removing unnecessary reshaping.
- Adjusted StatefulGptDecoderBatched to ensure sequence lengths are correctly shaped based on actual batch size and max beam width.

These changes enhance clarity and maintainability in the decoding process.
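
A minimal sketch of the two adjustments, assuming output.sequenceLengths is a runtime::ITensor::SharedPtr and treating batchSize and maxBeamWidth as hypothetical locals of the calling code:

    // GptDecoderBatched: take the output tensor directly instead of reshaping into an intermediate buffer
    TensorPtr sequenceLengths = output.sequenceLengths;

    // StatefulGptDecoderBatched: view the lengths with the actual batch size and maximum beam width
    auto sequenceLengthsView
        = runtime::ITensor::view(sequenceLengths, runtime::ITensor::makeShape({batchSize, maxBeamWidth}));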

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

* refactor: integrate DecoderState for sequence length management in decoding process

- Updated DecoderBuffers to remove direct handling of sequence lengths, now utilizing DecoderState for this purpose.
- Adjusted MakeDecodingBatchInputOutput to accept DecoderState, enhancing clarity in the decoding input/output management.
- Refactored GptDecoderBatched and StatefulGptDecoderBatched to streamline sequence length handling, ensuring consistency across the decoding workflow.

refactor: update SlotDecoderBuffers to manage sequence lengths directly

- Introduced sequenceLengths and sequenceLengthsHost to SlotDecoderBuffers for better management of sequence lengths.
- Refactored asyncSend and recv methods to utilize the new sequenceLengths member, enhancing clarity and reducing redundancy.
- Updated TrtGptModelInflightBatching to align with the new structure, ensuring consistent handling of sequence lengths across the decoding process.

These changes improve maintainability and streamline the decoding workflow.
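
The additions to SlotDecoderBuffers, as declared in the header below:

    TensorPtr sequenceLengths;     // [beamWidth]
    TensorPtr sequenceLengthsHost; // [beamWidth]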

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

* refactor: Delegate to asyncSend method in SlotDecoderBuffers
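
A plausible shape of this delegation, derived from the declarations in this header; which member buffers are forwarded to the static overload is an assumption:

    std::unique_ptr<DecoderSlotAsyncSend> SlotDecoderBuffers::asyncSend(
        std::shared_ptr<mpi::MpiComm> const& commSession, bool returnLogProbs, int peer) const
    {
        // Forward this slot's own buffers to the static overload.
        return asyncSend(commSession, outputIdsHost, sequenceLengthsHost, cumLogProbsHost, logProbsHost,
            returnLogProbs, peer);
    }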

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

---------

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
2025-04-12 11:41:24 +02:00


/*
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <optional>
#include <vector>

namespace tensorrt_llm::batch_manager
{
class DecoderInputBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    explicit DecoderInputBuffers(
        SizeType32 maxBatchSize, SizeType32 maxTokensPerEngineStep, runtime::BufferManager const& manager);

    // buffers for setup
    TensorPtr setupBatchSlots;
    TensorPtr inputsIds;

    // buffers for forward
    TensorPtr forwardBatchSlotsRequestOrder;
    TensorPtr forwardBatchSlotsRequestOrderDevice;
    TensorPtr fillValues;
    TensorPtr fillValuesDevice;
    TensorPtr forwardBatchSlots;
};
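
// Minimal usage sketch (illustrative only; assumes a BufferManager constructed from a CUDA stream,
// and the sizes are placeholder values):
//
//   auto manager = runtime::BufferManager{std::make_shared<runtime::CudaStream>()};
//   DecoderInputBuffers inputBuffers{/*maxBatchSize=*/64, /*maxTokensPerEngineStep=*/1, manager};
//   // setupBatchSlots/inputsIds are filled during request setup; the forward* buffers each decoding step.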

class DecoderStepAsyncSend
{
public:
    using BufferPtr = runtime::IBuffer::SharedPtr;

    DecoderStepAsyncSend(std::shared_ptr<mpi::MpiComm> const& commSession, BufferPtr const& newOutputTokensHost,
        BufferPtr const& finished, BufferPtr const& sequenceLengthsHost, BufferPtr const& cumLogProbsHost,
        BufferPtr const& logProbsHost, BufferPtr const& cacheIndirectionOutput, BufferPtr const& acceptedCumSum,
        BufferPtr const& packedPaths, BufferPtr const& finishReasonsHost, int peer);

    ~DecoderStepAsyncSend();
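
    // Presumably one asynchronous send per buffer (nine in total, tracked by mRequest1..mRequest9).
    // The MPI tags used stay within [kMpiTagOffset, kMpiTagUpperBound) so that DecoderSlotAsyncSend
    // below can claim a non-overlapping tag range (see its static_assert).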
    static auto constexpr kMpiTagOffset = 0;
    static auto constexpr kMpiTagUpperBound = kMpiTagOffset + 9;

private:
    std::shared_ptr<mpi::MpiRequest> mRequest1;
    std::shared_ptr<mpi::MpiRequest> mRequest2;
    std::shared_ptr<mpi::MpiRequest> mRequest3;
    std::shared_ptr<mpi::MpiRequest> mRequest4;
    std::shared_ptr<mpi::MpiRequest> mRequest5;
    std::shared_ptr<mpi::MpiRequest> mRequest6;
    std::shared_ptr<mpi::MpiRequest> mRequest7;
    std::shared_ptr<mpi::MpiRequest> mRequest8;
    std::shared_ptr<mpi::MpiRequest> mRequest9;
};

class DecoderSlotAsyncSend
{
public:
    using TensorPtr = runtime::ITensor::SharedPtr;

    DecoderSlotAsyncSend(std::shared_ptr<mpi::MpiComm> const& commSession, TensorPtr const& outputIdsView,
        TensorPtr const& sequenceLengthView, TensorPtr const& cumLogProbsView, TensorPtr const& logProbsView,
        bool returnLogProbs, int peer);

    ~DecoderSlotAsyncSend();

    static auto constexpr kMpiTagOffset = 9;
    static auto constexpr kMpiTagUpperBound = kMpiTagOffset + 4;
    static_assert(kMpiTagOffset >= DecoderStepAsyncSend::kMpiTagUpperBound);

private:
    std::shared_ptr<mpi::MpiRequest> mRequest1;
    std::shared_ptr<mpi::MpiRequest> mRequest2;
    std::shared_ptr<mpi::MpiRequest> mRequest3;
    std::shared_ptr<mpi::MpiRequest> mRequest4;
};

class DecoderBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    std::vector<TensorPtr> logits;
    TensorPtr slotOutputIds;     // [mMaxNumRequests, beamWidth, maxSeqLen], outputIds of all batch slots
    TensorPtr slotOutputIdsHost; // [beamWidth, maxSeqLen], outputIds of single batch slot
    TensorPtr cacheIndirectionInput;
    TensorPtr cacheIndirectionOutput;
    TensorPtr sequenceLengthsHost; // [mMaxNumRequests, beamWidth], pinned host tensor
    TensorPtr newOutputTokens;     // [maxTokensPerStep, mMaxNumRequests, beamWidth]
    TensorPtr newOutputTokensHost; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
    TensorPtr cumLogProbsHost;     // [mMaxNumRequests, beamWidth]
    TensorPtr logProbsHost;        // [mMaxNumRequests, beamWidth, maxSeqLen]
    TensorPtr finishedSumHost;     // [mMaxNumRequests], pinned host tensor
    TensorPtr finishReasonsHost;   // [mMaxNumRequests, beamWidth], pinned host tensor

    class DraftBuffers
    {
    public:
        TensorPtr nextDraftTokensDevice;        // [mMaxNumRequests, maxTokensPerStep-1]
        TensorPtr nextDraftTokensHost;          // [mMaxNumRequests, maxTokensPerStep-1]
        TensorPtr prevDraftTokensLengthsDevice; // [mMaxNumRequests]
        TensorPtr prevDraftTokensLengthsHost;   // [mMaxNumRequests]
        TensorPtr nextDraftTokensLengthsDevice; // [mMaxNumRequests]
        TensorPtr nextDraftTokensLengthsHost;   // [mMaxNumRequests]
        TensorPtr acceptedLengthsCumSumDevice;  // [mMaxNumRequests+1]
        TensorPtr acceptedPackedPathsDevice;    // [mMaxNumRequests * maxAcceptedTokens]
        std::vector<std::vector<runtime::ITensor::SharedPtr>>
            predictedDraftLogits; // [mMaxNumRequests][mMaxNumHeads][maxDraftTokens + 1, vocabSize]

        void create(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::BufferManager const& manager,
            runtime::ModelConfig const& modelConfig);
    };

    DraftBuffers draftBuffers;
    runtime::ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers;
    runtime::EagleBuffers::Inputs eagleBuffers;
    std::optional<runtime::LookaheadDecodingBuffers> lookaheadBuffers;

    DecoderBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow,
        SizeType32 maxSeqLen, SizeType32 maxTokensPerStep, runtime::BufferManager const& manager,
        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);

    std::unique_ptr<DecoderStepAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
        bool returnLogProbs, SizeType32 maxBeamWidth, bool useMedusa, int peer);

    void recv(std::shared_ptr<mpi::MpiComm> const& commSession, bool returnLogProbs, SizeType32 maxBeamWidth,
        bool useMedusa, int peer);

    void bcast(std::shared_ptr<mpi::MpiComm> const& commSession, bool returnLogProbs, SizeType32 maxBeamWidth,
        bool useMedusa, int root);

    void enableLookaheadDecoding(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep);
    void disableLookaheadDecoding(SizeType32 maxNumSequences);
};
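
// Illustrative exchange pattern between ranks (not part of this file; variable names are placeholders):
//
//   DecoderBuffers decoderBuffers{maxNumSequences, maxBeamWidth, maxAttentionWindow, maxSeqLen,
//       maxTokensPerStep, manager, modelConfig, worldConfig};
//   // Rank that runs the decoder:
//   auto handle = decoderBuffers.asyncSend(commSession, returnLogProbs, maxBeamWidth, useMedusa, peer);
//   // Receiving rank:
//   decoderBuffers.recv(commSession, returnLogProbs, maxBeamWidth, useMedusa, peer);
//   // 'handle' presumably completes the posted sends when it is destroyed.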

class SlotDecoderBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    TensorPtr outputIds;           // [beamWidth, maxSeqLen], outputIds of single batch slot
    TensorPtr outputIdsHost;       // [beamWidth, maxSeqLen], outputIds of single batch slot
    TensorPtr sequenceLengths;     // [beamWidth]
    TensorPtr sequenceLengthsHost; // [beamWidth]
    TensorPtr cumLogProbs;         // [beamWidth]
    TensorPtr cumLogProbsHost;     // [beamWidth]
    TensorPtr logProbs;            // [beamWidth, maxSeqLen]
    TensorPtr logProbsHost;        // [beamWidth, maxSeqLen]
    TensorPtr finishReasonsHost;   // [beamWidth]

    SlotDecoderBuffers(SizeType32 maxBeamWidth, SizeType32 maxSeqLen, runtime::BufferManager const& manager);

    static std::unique_ptr<DecoderSlotAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
        TensorPtr const& outputIdsView, TensorPtr const& sequenceLengthView, TensorPtr const& cumLogProbsView,
        TensorPtr const& logProbsView, bool returnLogProbs, int peer);

    std::unique_ptr<DecoderSlotAsyncSend> asyncSend(
        std::shared_ptr<mpi::MpiComm> const& commSession, bool returnLogProbs, int peer) const;

    void recv(std::shared_ptr<mpi::MpiComm> const& commSession, bool returnLogProbs, int peer);
};

} // namespace tensorrt_llm::batch_manager