Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
* refactor: Restructure DecoderBuffers and DecoderStepAsyncSend

  - Moved communication logic from `DecoderBuffers` to `DecoderStepAsyncSend`.
  - Updated the `DecoderStepAsyncSend` constructor to take the `DecoderBuffers` directly, reducing parameter complexity.
  - Refactored related methods to align with the new class structure, improving maintainability and readability.

  These changes streamline the handling of decoding buffers and improve the overall architecture of the batch manager.

  Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

* refactor: Restructure SlotDecoderBuffers and DecoderSlotAsyncSend

  - Moved communication logic from `SlotDecoderBuffers` to `DecoderSlotAsyncSend`.
  - Updated the `DecoderSlotAsyncSend` constructor to take the `SlotDecoderBuffers` directly, reducing parameter complexity.
  - Refactored related methods to align with the new class structure.

  Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

* chore: Log DecodingMode

  Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

* refactor: Introduce DecoderOutputBuffers and update related classes

  - Moved output buffers from `DecoderBuffers` to `DecoderOutputBuffers` to better reflect their purpose.
  - Updated `DecoderStepAsyncSend` to use `DecoderOutputBuffers`, clarifying the communication logic.
  - Refactored the `DecoderBuffers` constructor and methods to accommodate the new structure.
  - Added Python bindings for `DecoderOutputBuffers` to keep existing interfaces compatible.

  Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

* refactor: Update MPI communicator handling

  - Changed the `commSession` parameter type from `std::shared_ptr<mpi::MpiComm>` to `mpi::MpiComm` in the `DecoderStepAsyncSend` and `DecoderSlotAsyncSend` classes.
  - Updated related methods and constructors to reflect the new parameter type.
  - Refactored `TrtGptModelInflightBatching` to use `MpiComm` consistently.

  These modifications streamline the communication logic in the decoding process.

  Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

* refactor: Replace shared_ptr with unique_ptr for buffer management

  - Updated `TrtGptModelInflightBatching` to use `std::unique_ptr` instead of `std::shared_ptr` for `AllReduceBuffers`, `RuntimeBuffers`, `DecoderBuffers`, and `SlotDecoderBuffers`, making ownership explicit and reducing overhead.

  Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>

---------

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
192 lines
7.4 KiB
C++
/*
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <optional>
#include <vector>

namespace tensorrt_llm::batch_manager
{

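// Buffers feeding the decoder: batch slots and input token ids for decoder setup,
// plus batch-slot ordering and fill-value tensors for the forward pass, allocated
// for maxBatchSize requests and maxDecoderSteps steps.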
class DecoderInputBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    explicit DecoderInputBuffers(
        SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, runtime::BufferManager const& manager);

    // buffers for setup
    TensorPtr setupBatchSlots;
    TensorPtr inputsIds;

    // buffers for forward
    TensorPtr forwardBatchSlotsRequestOrder;
    TensorPtr forwardBatchSlotsRequestOrderDevice;
    TensorPtr fillValues;
    TensorPtr fillValuesDevice;
    std::vector<TensorPtr> forwardBatchSlots;
};

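// Host-side buffers that collect the decoder outputs of each step (new tokens,
// sequence lengths, cumulative and per-token log probs, finished states). Per the
// accompanying refactor, these are the buffers DecoderStepAsyncSend communicates
// between ranks.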
class DecoderOutputBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    DecoderOutputBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxSeqLen,
        SizeType32 maxTokensPerStep, runtime::BufferManager const& manager);

    void enableLookaheadDecoding(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep);
    void disableLookaheadDecoding(SizeType32 maxNumSequences);

    TensorPtr sequenceLengthsHost; // [mMaxNumRequests, beamWidth], pinned host tensor
    TensorPtr newOutputTokensHost; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
    TensorPtr cumLogProbsHost;     // [mMaxNumRequests, beamWidth]
    TensorPtr logProbsHost;        // [mMaxNumRequests, beamWidth, maxSeqLen]
    TensorPtr finishedSumHost;     // [mMaxNumRequests], pinned host tensor
    TensorPtr finishReasonsHost;   // [mMaxNumRequests, beamWidth], pinned host tensor
};

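// Decoder state kept across steps: the logits passed to the decoder, cache
// indirection input/output tensors, and buffers for speculative decoding
// (draft tokens, explicit draft tokens, Eagle, optional lookahead).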
class DecoderBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    std::vector<TensorPtr> logits;

    TensorPtr cacheIndirectionInput;
    TensorPtr cacheIndirectionOutput;

    class DraftBuffers
    {
    public:
        TensorPtr nextDraftTokensDevice;        // [mMaxNumRequests, maxTokensPerStep-1]
        TensorPtr nextDraftTokensHost;          // [mMaxNumRequests, maxTokensPerStep-1]
        TensorPtr prevDraftTokensLengthsDevice; // [mMaxNumRequests]
        TensorPtr prevDraftTokensLengthsHost;   // [mMaxNumRequests]
        TensorPtr nextDraftTokensLengthsDevice; // [mMaxNumRequests]
        TensorPtr nextDraftTokensLengthsHost;   // [mMaxNumRequests]
        TensorPtr acceptedLengthsCumSumDevice;  // [mMaxNumRequests+1]
        TensorPtr acceptedPackedPathsDevice;    // [mMaxNumRequests * maxAcceptedTokens]
        std::vector<std::vector<runtime::ITensor::SharedPtr>>
            predictedDraftLogits; // [mMaxNumRequests][mMaxNumHeads][maxDraftTokens + 1, vocabSize]

        void create(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::BufferManager const& manager,
            runtime::ModelConfig const& modelConfig);
    };

    DraftBuffers draftBuffers;
    runtime::ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers;
    runtime::EagleBuffers::Inputs eagleBuffers;
    std::optional<runtime::LookaheadDecodingBuffers> lookaheadBuffers;

    DecoderBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow,
        SizeType32 maxTokensPerStep, runtime::BufferManager const& manager, runtime::ModelConfig const& modelConfig,
        runtime::WorldConfig const& worldConfig);
};

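// Posts non-blocking MPI sends of the per-step decoder results (output buffers,
// cache indirection, and optionally draft/Medusa buffers); the matching receive
// and broadcast are the static recv()/bcast() methods. Illustrative sketch of the
// intended call pattern; the variable names and the overlap of other work are
// assumptions, not taken from this header:
//
//   // sending rank
//   auto send = std::make_unique<DecoderStepAsyncSend>(decoderOutputBuffers, decoderBuffers,
//       returnLogProbs, maxBeamWidth, useMedusa, commSession, peerRank);
//   // ... overlap other work while the sends are in flight ...
//   send.reset(); // destructor completes the outstanding requests
//
//   // receiving rank
//   DecoderStepAsyncSend::recv(decoderOutputBuffers, decoderBuffers,
//       returnLogProbs, maxBeamWidth, useMedusa, commSession, peerRank);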
class DecoderStepAsyncSend
{
public:
    using SizeType32 = runtime::SizeType32;
    using BufferPtr = runtime::IBuffer::SharedPtr;

    DecoderStepAsyncSend(DecoderOutputBuffers const& decoderOutputBuffers, DecoderBuffers const& decoderBuffers,
        bool returnLogProbs, SizeType32 maxBeamWidth, bool useMedusa, mpi::MpiComm const& commSession, int peer);

    ~DecoderStepAsyncSend();

    static void recv(DecoderOutputBuffers const& decoderOutputBuffers, DecoderBuffers const& decoderBuffers,
        bool returnLogProbs, SizeType32 maxBeamWidth, bool useMedusa, mpi::MpiComm const& commSession, int peer);

    static void bcast(DecoderOutputBuffers const& decoderOutputBuffers, DecoderBuffers const& decoderBuffers,
        bool returnLogProbs, SizeType32 maxBeamWidth, bool useMedusa, mpi::MpiComm const& commSession, int root);

    static auto constexpr kMpiTagOffset = 0;
    static auto constexpr kMpiTagUpperBound = kMpiTagOffset + 9;

private:
    std::shared_ptr<mpi::MpiRequest> mRequest1;
    std::shared_ptr<mpi::MpiRequest> mRequest2;
    std::shared_ptr<mpi::MpiRequest> mRequest3;
    std::shared_ptr<mpi::MpiRequest> mRequest4;
    std::shared_ptr<mpi::MpiRequest> mRequest5;
    std::shared_ptr<mpi::MpiRequest> mRequest6;
    std::shared_ptr<mpi::MpiRequest> mRequest7;
    std::shared_ptr<mpi::MpiRequest> mRequest8;
    std::shared_ptr<mpi::MpiRequest> mRequest9;
};

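// Device and host copies of the decoder results for a single batch slot
// (output ids, sequence lengths, log probs, finish reasons).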
class SlotDecoderBuffers
{
public:
    using SizeType32 = runtime::SizeType32;
    using TensorPtr = runtime::ITensor::SharedPtr;

    TensorPtr outputIds;           // [beamWidth, maxSeqLen], outputIds of single batch slot
    TensorPtr outputIdsHost;       // [beamWidth, maxSeqLen], outputIds of single batch slot
    TensorPtr sequenceLengths;     // [beamWidth]
    TensorPtr sequenceLengthsHost; // [beamWidth]
    TensorPtr cumLogProbs;         // [beamWidth]
    TensorPtr cumLogProbsHost;     // [beamWidth]
    TensorPtr logProbs;            // [beamWidth, maxSeqLen]
    TensorPtr logProbsHost;        // [beamWidth, maxSeqLen]
    TensorPtr finishReasonsHost;   // [beamWidth]

    SlotDecoderBuffers(SizeType32 maxBeamWidth, SizeType32 maxSeqLen, runtime::BufferManager const& manager);
};

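// Non-blocking MPI transfer of a single slot's buffers. Its tag range starts
// where DecoderStepAsyncSend's ends (enforced by the static_assert below), so
// the two kinds of transfers cannot clash on the same communicator.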
class DecoderSlotAsyncSend
{
public:
    using TensorPtr = runtime::ITensor::SharedPtr;

    DecoderSlotAsyncSend(TensorPtr const& outputIds, TensorPtr const& sequenceLengths, TensorPtr const& cumLogProbs,
        TensorPtr const& logProbs, bool returnLogProbs, mpi::MpiComm const& commSession, int peer);

    DecoderSlotAsyncSend(
        SlotDecoderBuffers const& slotDecoderBuffers, bool returnLogProbs, mpi::MpiComm const& commSession, int peer);

    ~DecoderSlotAsyncSend();

    static void recv(
        SlotDecoderBuffers const& slotDecoderBuffers, bool returnLogProbs, mpi::MpiComm const& commSession, int peer);

    static auto constexpr kMpiTagOffset = 9;
    static auto constexpr kMpiTagUpperBound = kMpiTagOffset + 4;
    static_assert(kMpiTagOffset >= DecoderStepAsyncSend::kMpiTagUpperBound);

private:
    std::shared_ptr<mpi::MpiRequest> mRequest1;
    std::shared_ptr<mpi::MpiRequest> mRequest2;
    std::shared_ptr<mpi::MpiRequest> mRequest3;
    std::shared_ptr<mpi::MpiRequest> mRequest4;
};

} // namespace tensorrt_llm::batch_manager