Runtime

bufferManager.h

namespace tensorrt_llm
namespace runtime
class BufferManager
#include <bufferManager.h>

A helper class for managing memory on host and device.

Public Types

using IBufferPtr = IBuffer::UniquePtr
using ITensorPtr = ITensor::UniquePtr
using CudaStreamPtr = std::shared_ptr<CudaStream>

Public Functions

explicit BufferManager(CudaStreamPtr stream)

Construct a BufferManager.

Parameters:

cudaStream – [in] The CUDA stream to use for all operations on the GPU (allocation, de-allocation, copying, etc.).

IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size on the GPU.

ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions on the GPU.

IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an IBuffer of the given size and memory type.

ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const

Allocates an ITensor of the given dimensions and memory type.

inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty IBuffer of the given memory type. It may be resized later.

inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const

Create an empty ITensor of the given memory type. It may be reshaped later.

void setZero(IBuffer &buffer) const

Set the contents of the given buffer to zero.

void copy(void const *src, IBuffer &dst, MemoryType srcType) const

Copy src to dst.

void copy(IBuffer const &src, void *dst, MemoryType dstType) const

Copy src to dst.

inline void copy(void const *src, IBuffer &dst) const

Copy src to dst.

inline void copy(IBuffer const &src, void *dst) const

Copy src to dst.

void copy(IBuffer const &src, IBuffer &dst) const

Copy src to dst.

IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const

Copy src into a new IBuffer with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const

Copy src into a new ITensor with a potentially different memory type.

CudaStream const &getStream() const

Get the underlying cuda stream.

std::size_t memoryPoolReserved() const

The current size of the memory reserved by the memory pool.

std::size_t memoryPoolUsed() const

The current size of the memory used by the memory pool.

std::size_t memoryPoolFree() const

The current size of the memory free in the memory pool.

void memoryPoolTrimTo(std::size_t size)

Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.

Public Static Functions

static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an IBuffer of the given size on the CPU.

static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates an ITensor of the given dimensions on the CPU.

static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned IBuffer of the given size on the CPU.

static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)

Allocates a pinned ITensor of the given dimensions on the CPU.

Public Static Attributes

static constexpr auto kBYTE_TYPE = nvinfer1::DataType::kUINT8

Private Members

CudaStreamPtr mStream

Private Static Functions

static void initMemoryPool(int device)
static std::size_t memoryPoolReserved(int device)
static std::size_t memoryPoolUsed(int device)
static inline std::size_t memoryPoolFree(int device)
static void memoryPoolTrimTo(int device, std::size_t size)
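
A minimal usage sketch of BufferManager (the tensorrt_llm/runtime include prefix and the shapes/values are illustrative assumptions):

#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/cudaStream.h>

#include <cstdint>
#include <memory>
#include <vector>

using namespace tensorrt_llm::runtime;

void bufferManagerExample()
{
    // All GPU operations issued by the manager use this stream.
    auto stream = std::make_shared<CudaStream>();
    BufferManager manager{stream};

    // A float tensor of shape [2, 3] on the GPU.
    auto gpuTensor = manager.gpu(ITensor::makeShape({2, 3}), nvinfer1::DataType::kFLOAT);
    manager.setZero(*gpuTensor);

    // Copy host data into a new GPU buffer; type and size are deduced from the vector.
    std::vector<std::int32_t> hostData{1, 2, 3, 4};
    auto gpuBuffer = manager.copyFrom(hostData, MemoryType::kGPU);

    // Copies and kernels are asynchronous; synchronize before reading results on the host.
    stream->synchronize();
}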

common.h

namespace tensorrt_llm
namespace runtime

Typedefs

using SizeType = std::int32_t
using TokenIdType = std::int32_t
template<typename T>
using StringPtrMap = std::unordered_map<std::string, std::shared_ptr<T>>

cudaEvent.h

namespace tensorrt_llm
namespace runtime
class CudaEvent

Public Types

using pointer = cudaEvent_t

Public Functions

inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)

Creates a new cuda event. The event will be destroyed in the destructor.

Parameters:

flags – Flags for event creation. By default, event timing is disabled.

inline explicit CudaEvent(pointer event, bool ownsEvent = true)

Pass an existing cuda event to this object.

Parameters:
  • event – The event to pass to this object.

  • ownsEvent – Whether this object owns the event and destroys it in the destructor.

inline pointer get() const

Returns the event associated with this object.

inline void synchronize() const

Synchronizes the event.

Private Types

using element_type = std::remove_pointer_t<pointer>
using EventPtr = std::unique_ptr<element_type, Deleter>

Private Members

EventPtr mEvent
class Deleter

Public Functions

inline explicit Deleter(bool ownsEvent)
inline explicit Deleter()
inline constexpr void operator()(pointer event) const

Private Members

bool mOwnsEvent

cudaStream.h

namespace tensorrt_llm
namespace runtime
class CudaStream

Public Functions

inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)

Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.

Parameters:
  • flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.

  • priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.

inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)

Pass an existing cuda stream to this object.

Parameters:
  • stream – The stream to pass to this object.

  • device – The device on which the stream was created.

  • ownsStream – Whether this object owns the stream and destroys it in the destructor.

inline int getDevice() const

Returns the device on which the stream was created.

inline cudaStream_t get() const

Returns the stream associated with this object.

inline void synchronize() const

Synchronizes the stream.

inline void record(CudaEvent::pointer event) const

Record an event on the stream.

inline void record(CudaEvent const &event) const

Record an event on the stream.

inline void wait(CudaEvent::pointer event) const

Wait for an event.

inline void wait(CudaEvent const &event) const

Wait for an event.

Private Types

using StreamPtr = std::unique_ptr<std::remove_pointer_t<cudaStream_t>, Deleter>

Private Members

StreamPtr mStream
int mDevice = {-1}
class Deleter

Public Functions

inline explicit Deleter(bool ownsStream)
inline explicit Deleter()
inline constexpr void operator()(cudaStream_t stream) const

Private Members

bool mOwnsStream
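
A small sketch of how CudaStream and CudaEvent are used together (include paths are assumptions):

#include <tensorrt_llm/runtime/cudaEvent.h>
#include <tensorrt_llm/runtime/cudaStream.h>

using namespace tensorrt_llm::runtime;

void streamEventExample()
{
    CudaStream producer; // non-blocking stream on the current device
    CudaStream consumer;
    CudaEvent event;     // event timing disabled by default

    // ... enqueue work on `producer` here ...
    producer.record(event); // mark the point the consumer has to wait for
    consumer.wait(event);   // consumer waits on the GPU without blocking the host
    // ... enqueue dependent work on `consumer` here ...

    consumer.synchronize(); // block the host until the consumer stream is done
}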

decodingInput.h

namespace tensorrt_llm
namespace runtime
class DecodingInput

Public Types

using TensorPtr = std::shared_ptr<ITensor const>

Public Functions

inline DecodingInput(SizeType maxLength, SizeType maxKvCacheLength, SizeType batchSize, TensorPtr logits, TensorPtr endIds)

Public Members

SizeType step
SizeType maxLength
SizeType maxKvCacheLength
SizeType batchSize
TensorPtr logits
TensorPtr endIds
TensorPtr finished
TensorPtr sequenceLimitLength
TensorPtr embeddingBias
TensorPtr lengths
TensorPtr badWordsList
TensorPtr stopWordsList
TensorPtr noRepeatNgramSize
TensorPtr cacheIndirection

decodingOutput.h

namespace tensorrt_llm
namespace runtime
class DecodingOutput

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit DecodingOutput(TensorPtr ids)

Public Members

TensorPtr ids
TensorPtr newTokensSteps
TensorPtr newTokens
std::vector<TensorPtr> newTokensVec
TensorPtr finishedSteps
TensorPtr finished
TensorPtr finishedSum
TensorPtr logProbs
TensorPtr cumLogProbs
TensorPtr parentIds
TensorPtr lengths
TensorPtr cacheIndirection
BeamHypotheses beamHypotheses

Public Static Attributes

static constexpr float kNegativeInfinity = -1e20f
class BeamHypotheses

Public Functions

void empty(BufferManager &manager)
void reshape(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
void release()
void init(BufferManager &manager, TokenIdType endId)
BeamHypotheses slice(SizeType batchIndex, SizeType size) const

Public Members

TensorPtr outputIdsTgt
TensorPtr sequenceLengthsTgt
TensorPtr cumLogProbs
TensorPtr normedScores
TensorPtr logProbs
TensorPtr minNormedScores
TensorPtr numBeams
TensorPtr isDone

generationInput.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput

Public Types

using TensorPtr = TTensor

Public Functions

inline explicit GenericGenerationInput(SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)

Public Members

SizeType endId
SizeType padId
TensorPtr ids
TensorPtr lengths
bool packed
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
std::optional<SizeType> maxNewTokens
PromptTuningParams promptTuningParams
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>

Public Types

using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationInput(SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false)
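
A sketch of assembling a GenerationInput from padded token ids and per-request lengths (the token ids, endId/padId and shapes are illustrative assumptions):

#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/generationInput.h>

#include <vector>

using namespace tensorrt_llm::runtime;

GenerationInput makeInput(BufferManager& manager)
{
    constexpr SizeType endId = 50256; // model-dependent
    constexpr SizeType padId = 50256;

    // Two requests of lengths 4 and 2, padded to the maximum input length of 4.
    std::vector<TokenIdType> tokens{10, 11, 12, 13, 20, 21, padId, padId};
    std::vector<SizeType> lengths{4, 2};

    auto ids = manager.copyFrom(tokens, ITensor::makeShape({2, 4}), MemoryType::kGPU);
    auto inputLengths = manager.copyFrom(lengths, ITensor::makeShape({2}), MemoryType::kGPU);

    GenerationInput input{endId, padId, std::move(ids), std::move(inputLengths), /*packed=*/false};
    input.maxNewTokens = 16;
    return input;
}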

generationOutput.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor>
class GenericGenerationOutput

Public Types

using TensorPtr = TTensor
using Callback = std::function<void(TensorPtr const &ids, SizeType step, bool finished)>

Public Functions

inline explicit GenericGenerationOutput(TensorPtr ids, TensorPtr lengths)

Public Members

TensorPtr ids
TensorPtr lengths
TensorPtr cumLogProbs
TensorPtr logProbs
TensorPtr contextLogits
TensorPtr generationLogits
Callback onTokenGenerated
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>

Public Types

using Base = GenericGenerationOutput<ITensor::SharedPtr>
using TensorPtr = Base::TensorPtr

Public Functions

inline explicit GenerationOutput(TensorPtr ids, TensorPtr lengths)
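
A sketch of preparing GenerationOutput buffers and an optional streaming callback (the session fills and resizes the empty tensors during generation; everything else is an assumption):

#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/generationOutput.h>

using namespace tensorrt_llm::runtime;

GenerationOutput makeOutput(BufferManager& manager)
{
    GenerationOutput output{
        manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32),  // ids
        manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; // lengths

    // Optional: invoked after generation steps to stream partial results.
    output.onTokenGenerated
        = [](GenerationOutput::TensorPtr const& ids, SizeType step, bool finished) { /* ... */ };
    return output;
}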

gptDecoder.h

namespace tensorrt_llm
namespace layers
namespace runtime
class IGptDecoder

Subclassed by tensorrt_llm::runtime::GptDecoder< T >

Public Functions

virtual ~IGptDecoder() = default
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength) = 0
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) = 0

Public Static Functions

static void acceptTokens(const ITensor &targetTokenIds, const ITensor &draftTokenIds, const ITensor &contextLengths, const ITensor &numDraftTokens, ITensor &sequenceLengths, const ITensor &finishedVec, ITensor &finishedFinal, ITensor &finishedSum, BufferManager::CudaStreamPtr const &stream)
static inline std::unique_ptr<IGptDecoder> create(nvinfer1::DataType dtype, size_t vocabSize, size_t vocabSizePadded, BufferManager::CudaStreamPtr const &stream)
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder

Public Types

using CudaStreamPtr = BufferManager::CudaStreamPtr
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

GptDecoder(size_t vocabSize, size_t vocabSizePadded, CudaStreamPtr const &stream)
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength) override
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) override

Private Members

BufferManager mManager
common::CudaAllocator mAllocator
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
TensorPtr mLogProbsTiled

gptDecoderBatch.h

namespace tensorrt_llm
namespace runtime
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
#include <gptDecoderBatch.h>

GPT decoder class with support for in-flight batching.

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = ITensor::SharedPtr

Public Functions

GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, nvinfer1::DataType dtype) override

Set up the decoder before calling forward().

virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) override

Initialize the decoder at batchIdx with a new request.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) override

Initialize the decoder with new batch of inputs.

virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::Token const &e) override

Wait for the call to forwardAsync associated with a token to complete.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override

Run one step for all requests without blocking the host thread.

virtual void forwardSync() override

Wait for the last call to forwardAsync to complete.

inline virtual std::vector<bool> getFinished() const override
Returns:

[batchSize], indicators of finished requests

inline virtual TensorPtr getOutputIds(SizeType batchIdx) const override
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu

inline virtual TensorPtr getOutputIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu

virtual CudaEvent finalize(SizeType batchIdx) const

Gather final beam search results for request batchIdx. The result will only be available after the returned event has been synchronized.

virtual void finalize() const override

Gather final beam search results for all requests.

inline virtual TensorPtr getParentIds() const override
Returns:

[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu

inline virtual TensorPtr getCumLogProbs() const override
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getCumLogProbs(SizeType batchIdx) const
Returns:

[maxBeamWidth], cumulative log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs() const override
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getLogProbs(SizeType batchIdx) const
Returns:

[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

inline virtual TensorPtr getAllNewTokens() const override

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

inline virtual TensorPtr getNewTokens(SizeType iter = 0) const override

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

inline virtual std::vector<SizeType> getNbSteps() const override
Returns:

[batchSize], the number of generation steps executed on each request

inline virtual TensorPtr getNbFinished() const override
Returns:

[1], number of finished sequences, in pinned host memory

Private Types

using GptDecoderPtr = std::unique_ptr<IGptDecoder>
using DecodingInputPtr = std::unique_ptr<DecodingInput>
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>

Private Functions

CudaEvent postProcessRequest(SizeType batchIdx) const

Gather final beam search results for request batchIdx.

Private Members

std::size_t const mVocabSize
std::size_t const mVocabSizePadded
CudaStreamPtr mStream
BufferManager mBufferManager
TokenPtr mForwardToken
CudaEvent mForwardEvent
std::vector<CudaStreamPtr> mStreams
std::vector<GptDecoderPtr> mDecoders
std::vector<DecodingInputPtr> mDecodingInputs
std::vector<DecodingOutputPtr> mDecodingOutputs
DecodingInputPtr mJointDecodingInput
DecodingOutputPtr mJointDecodingOutput
std::vector<TensorPtr> mDraftTokenIds
TensorPtr mNumDraftTokens
std::vector<SizeType> mNbSteps
std::vector<bool> mFinished
TensorPtr mFinishedSum
std::vector<SizeType> mMaxNewTokens
std::vector<SizeType> mBeamWidths
std::vector<SizeType> mGeneratedTokensPerStep
SizeType mMaxSequenceLength = {}
SizeType mMaxKvCacheLength = {}
SizeType mActualBatchSize = {}
SizeType mMaxTokensPerStep = {}

gptJsonConfig.h

namespace tensorrt_llm
namespace runtime
class GptJsonConfig

Public Functions

inline GptJsonConfig(std::string name, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, GptModelConfig const &modelConfig)
inline GptModelConfig getModelConfig() const
inline std::string const &getName() const
inline std::string const &getPrecision() const
inline constexpr SizeType getTensorParallelism() const
inline constexpr SizeType getPipelineParallelism() const
inline constexpr SizeType getWorldSize() const
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
inline std::string engineFilename(WorldConfig const &worldConfig) const

Public Static Functions

static GptJsonConfig parse(std::string const &json)
static GptJsonConfig parse(std::istream &json)
static GptJsonConfig parse(std::filesystem::path const &path)

Private Members

std::string const mName
std::string const mPrecision
SizeType const mTensorParallelism
SizeType const mPipelineParallelism
GptModelConfig const mGptModelConfig
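
A sketch of loading the build-time configuration and locating the per-rank engine file (the engine directory path is an assumption):

#include <tensorrt_llm/runtime/gptJsonConfig.h>
#include <tensorrt_llm/runtime/gptModelConfig.h>
#include <tensorrt_llm/runtime/worldConfig.h>

#include <filesystem>
#include <string>

using namespace tensorrt_llm::runtime;

void jsonConfigExample()
{
    auto const json = GptJsonConfig::parse(std::filesystem::path{"engine_dir/config.json"});
    GptModelConfig const modelConfig = json.getModelConfig();

    // The runtime world configuration must match the parallelism the engine was built with.
    auto const worldConfig = WorldConfig::mpi(WorldConfig::kDefaultGpusPerNode,
        json.getTensorParallelism(), json.getPipelineParallelism());

    // Name of the serialized engine for this rank inside the engine directory.
    std::string const engineFile = json.engineFilename(worldConfig);
}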

gptModelConfig.h

namespace tensorrt_llm
namespace runtime
class GptModelConfig

Public Types

enum class ModelVariant : std::int32_t

Values:

enumerator kGpt
enumerator kGlm

Public Functions

inline explicit constexpr GptModelConfig(SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
inline constexpr SizeType getVocabSize() const noexcept
inline constexpr SizeType getVocabSizePadded(SizeType worldSize) const noexcept
inline constexpr SizeType getNbLayers(SizeType pipelineParallelism = 1) const
inline constexpr SizeType getNbHeads() const noexcept
inline constexpr SizeType getNbKvHeads() const noexcept
inline constexpr void setNbKvHeads(SizeType nbKvHeads) noexcept
inline constexpr SizeType getHiddenSize() const noexcept
inline constexpr SizeType getSizePerHead() const noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool useGptAttentionPlugin() const noexcept
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
inline constexpr bool usePackedInput() const noexcept
inline constexpr void usePackedInput(bool inputPacked) noexcept
inline constexpr bool usePagedKvCache() const noexcept
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
inline constexpr SizeType getTokensPerBlock() const noexcept
inline constexpr void setTokensPerBlock(SizeType TokensPerBlock) noexcept
inline constexpr common::QuantMode getQuantMode() const noexcept
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
inline constexpr bool supportsInflightBatching() const noexcept
inline constexpr SizeType getMaxBatchSize() const noexcept
inline constexpr void setMaxBatchSize(SizeType maxBatchSize) noexcept
inline constexpr SizeType getMaxInputLen() const noexcept
inline constexpr void setMaxInputLen(SizeType maxInputLen) noexcept
inline constexpr SizeType getMaxOutputLen() const noexcept
inline constexpr void setMaxOutputLen(SizeType maxOutputLen) noexcept
inline constexpr std::optional<SizeType> getMaxNumTokens() const noexcept
inline constexpr void setMaxNumTokens(std::optional<SizeType> maxNumTokens) noexcept
inline constexpr bool usePromptTuning() const noexcept
inline constexpr SizeType getMaxPromptEmbeddingTableSize() const noexcept
inline constexpr void setMaxPromptEmbeddingTableSize(SizeType maxPromptEmbeddingTableSize) noexcept
inline constexpr bool computeContextLogits() const noexcept
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
inline constexpr bool computeGenerationLogits() const noexcept
inline constexpr void computeGenerationLogits(bool computeGenerationLogits) noexcept
inline ModelVariant getModelVariant() const
inline void setModelVariant(ModelVariant modelVariant)
inline constexpr bool useCustomAllReduce() const noexcept
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
inline constexpr void setMaxDraftLen(SizeType maxDraftLen) noexcept
inline constexpr SizeType getMaxTokensPerStep() const noexcept

Private Members

SizeType mVocabSize
SizeType mNbLayers
SizeType mNbHeads
SizeType mNbKvHeads
SizeType mHiddenSize
nvinfer1::DataType mDataType
bool mUseGptAttentionPlugin
bool mInputPacked
bool mPagedKvCache
SizeType mTokensPerBlock
common::QuantMode mQuantMode
SizeType mMaxBatchSize
SizeType mMaxInputLen
SizeType mMaxOutputLen
std::optional<SizeType> mMaxNumTokens
bool mComputeContextLogits
bool mComputeGenerationLogits
ModelVariant mModelVariant
bool mUseCustomAllReduce
SizeType mMaxPromptEmbeddingTableSize
SizeType mMaxDraftLen
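
A sketch of constructing a GptModelConfig by hand; in practice it is usually obtained from GptJsonConfig::parse, and all values below are illustrative:

#include <tensorrt_llm/runtime/gptModelConfig.h>

using namespace tensorrt_llm::runtime;

GptModelConfig makeModelConfig()
{
    GptModelConfig config{/*vocabSize=*/50257, /*nbLayers=*/12, /*nbHeads=*/12,
        /*hiddenSize=*/768, nvinfer1::DataType::kHALF};
    config.useGptAttentionPlugin(true);
    config.usePackedInput(true);
    config.usePagedKvCache(true);
    config.setTokensPerBlock(64);
    config.setMaxBatchSize(8);
    config.setMaxInputLen(1024);
    config.setMaxOutputLen(256);
    return config;
}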

gptSession.h

namespace tensorrt_llm
namespace batch_manager
namespace kv_cache_manager
namespace runtime
class GptSession

Public Types

using LoggerPtr = std::shared_ptr<nvinfer1::ILogger>

Public Functions

GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
nvinfer1::ILogger &getLogger() const
BufferManager const &getBufferManager() const
inline GptModelConfig const &getModelConfig() const
inline WorldConfig const &getWorldConfig() const
inline int getDevice() const noexcept
void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)

Private Types

using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
using TensorPtr = runtime::ITensor::SharedPtr
using TokenGeneratedCallback = std::function<void(SizeType step, bool finished)>

Private Functions

inline bool useCudaGraphs()
void generateBatched(std::vector<GenerationOutput> &microBatchesOutputs, std::vector<GenerationInput> const &microBatchesInputs, SamplingConfig const &samplingConfig, TokenGeneratedCallback const &onTokenGenerated)
void setup(Config const &sessionConfig)
void createContexts(SizeType numBatchesCtx, SizeType numBatchesGen, bool useCudaGraphs)
void createBuffers(SizeType numMicroBatches)
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches)
void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength, KvCacheConfig const &config)
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
void executeContextStep(std::vector<GenerationInput> const &microBatches, std::vector<SizeType> const &microBatchOffsets, KvCacheManager const *kvCacheManager)
SizeType executeGenerationStep(SizeType step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType> const &microBatchOffsets, KvCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)
void decoderStepAsync(SizeType decoderStep, SizeType microBatchId)

Execute decoder on last PP rank, receive decoder output on other PP ranks.

bool shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)

Synchronize with the decoder and return the shouldStop flag.

void finalize(SizeType microBatchId)

Collect final output ids and log probs on last PP rank and send them to first PP rank.

Receives are asynchronous on host, so synchronization is required before access.

void kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, SizeType firstBatchIdx)
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType microBatchId) const

Populate outputIds and return reference to newTokens tensor.

TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)

Private Members

GptModelConfig const mModelConfig
WorldConfig const mWorldConfig
int mDevice = {-1}
std::shared_ptr<NcclCommunicator> mPipelineComm
std::shared_ptr<CudaStream> mCommStream
CudaEvent mCommEvent = {}
ITensor::SharedPtr mCommPtrs
std::vector<std::shared_ptr<IpcMemory>> mIpcMemoryHandles
SizeType mDecoderMaxSequenceLength = {}
SizeType mDecoderMaxKvCacheLength = {}
LoggerPtr mLogger
std::shared_ptr<TllmRuntime> mRuntime
std::shared_ptr<KvCacheManager> mKvCacheManager
MicroBatchConfig mMicroBatchConfig
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
std::vector<CudaEvent> mReceivedEvents
bool mCudaGraphMode = {false}
std::vector<CudaGraphExecutor> mCudaGraphInstances

Friends

friend class batch_manager::TrtGptModelV1
class Config
#include <gptSession.h>

Configuration for session execution and buffer sizes. generate may be called with batch size and beam width smaller than the configured parameters.

maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.

Public Functions

inline Config(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength)

Public Members

SizeType maxBatchSize
SizeType maxBeamWidth
SizeType maxSequenceLength
bool decoderPerRequest = {false}
bool cudaGraphMode = {false}
KvCacheConfig kvCacheConfig = {}
std::optional<SizeType> ctxMicroBatchSize = std::nullopt
std::optional<SizeType> genMicroBatchSize = std::nullopt
class CudaGraphExecutor

Public Functions

CudaGraphExecutor() = default
inline ~CudaGraphExecutor()
inline bool hasInstance()
void clear()
void prepareNextGraph(TllmRuntime const &runtime, SizeType nextContextId)
void launch(CudaStream const &stream)

Private Functions

void create(cudaGraph_t const &graph)
bool update(cudaGraph_t const &graph)
void uploadToStream(CudaStream const &stream)

Private Members

cudaGraphExec_t mInstance
class MicroBatchConfig

Public Functions

inline MicroBatchConfig()
explicit MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism, std::optional<SizeType> genMicroBatchSize, std::optional<SizeType> ctxMicroBatchSize)
inline constexpr SizeType numCtxPerGen() const
inline constexpr SizeType getCtxContextId(SizeType generationBatchId, SizeType contextBatchId) const

The first 2 * numGenBatches contexts are used for the generation phase; the next numCtxBatches are used for the context phase. Each generation batch uses numCtxPerGen() contexts for its context batches.

inline constexpr SizeType getGenContextId(SizeType flipFlopId, SizeType generationBatchId) const

The first 2 * numGenBatches contexts are used for the generation phase; each generation batch flip-flops between two of them.

Public Members

SizeType numCtxBatches
SizeType numGenBatches
SizeType ctxBatchSize
SizeType genBatchSize
namespace utils

Functions

std::vector<uint8_t> loadEngine(std::string const &enginePath)
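
A minimal end-to-end sketch combining GptJsonConfig, WorldConfig, GptSession and SamplingConfig (paths, sizes and sampling values are assumptions; tokenization and error handling are omitted):

#include <tensorrt_llm/runtime/gptJsonConfig.h>
#include <tensorrt_llm/runtime/gptSession.h>
#include <tensorrt_llm/runtime/samplingConfig.h>
#include <tensorrt_llm/runtime/worldConfig.h>

#include <filesystem>
#include <string>
#include <vector>

using namespace tensorrt_llm::runtime;

void runSession(GenerationInput const& input, GenerationOutput& output)
{
    auto const json = GptJsonConfig::parse(std::filesystem::path{"engine_dir/config.json"});
    auto const modelConfig = json.getModelConfig();
    auto const worldConfig = WorldConfig::mpi();

    GptSession::Config sessionConfig{/*maxBatchSize=*/8, /*maxBeamWidth=*/1, /*maxSequenceLength=*/1024};
    std::string const enginePath = "engine_dir/" + json.engineFilename(worldConfig);
    GptSession session{sessionConfig, modelConfig, worldConfig, enginePath};

    SamplingConfig samplingConfig{/*beamWidth=*/1};
    samplingConfig.temperature = std::vector<float>{0.8f};

    // Runs the full generation loop; results are written into `output`.
    session.generate(output, input, samplingConfig);
}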

iBuffer.h

template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static constexpr auto value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static constexpr auto value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static constexpr auto value = "PINNED"
template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static constexpr char name[] = "float"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
namespace tensorrt_llm
namespace runtime

Typedefs

template<typename T>
using PointerElementType = typename std::remove_reference_t<T>::element_type

Enums

enum class MemoryType : std::int32_t

Values:

enumerator kGPU
enumerator kCPU
enumerator kPINNED

Functions

template<typename T>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::shared_ptr<T> const &ptr) noexcept
template<typename T, typename D>
std::shared_ptr<std::remove_const_t<T>> constPointerCast(std::unique_ptr<T, D> &&ptr) noexcept
template<typename T>
T const *bufferCast(IBuffer const &buffer)
template<typename T>
T *bufferCast(IBuffer &buffer)
std::ostream &operator<<(std::ostream &output, IBuffer const &buffer)

Utility function to print a buffer.

template<MemoryType T>
struct MemoryTypeString
template<>
struct MemoryTypeString<MemoryType::kGPU>

Public Static Attributes

static constexpr auto value = "GPU"
template<>
struct MemoryTypeString<MemoryType::kCPU>

Public Static Attributes

static constexpr auto value = "CPU"
template<>
struct MemoryTypeString<MemoryType::kPINNED>

Public Static Attributes

static constexpr auto value = "PINNED"
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits
#include <iBuffer.h>

For converting a TensorRT data type to a C++ data type.

template<>
struct DataTypeTraits<nvinfer1::DataType::kFLOAT>

Public Types

using type = float

Public Static Attributes

static constexpr char name[] = "float"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kHALF>

Public Types

using type = half

Public Static Attributes

static constexpr char name[] = "half"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT8>

Public Types

using type = std::int8_t

Public Static Attributes

static constexpr char name[] = "int8"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32>

Public Types

using type = std::int32_t

Public Static Attributes

static constexpr char name[] = "int32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64>

Public Types

using type = std::int64_t

Public Static Attributes

static constexpr char name[] = "int64"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true>

Public Types

using type = std::uint32_t

Public Static Attributes

static constexpr char name[] = "uint32"
static constexpr auto size = sizeof(type)
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true>

Public Types

using type = std::uint64_t

Public Static Attributes

static constexpr char name[] = "uint64"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>

Public Types

using type = bool

Public Static Attributes

static constexpr char name[] = "bool"
static constexpr auto size = sizeof(type)
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>

Public Types

using type = std::uint8_t

Public Static Attributes

static constexpr char name[] = "uint8"
static constexpr auto size = sizeof(type)
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>

Public Types

using type = typename DataTypeTraits<kDataType, kUnsigned, false>::type*

Public Static Attributes

static constexpr char name[] = "*"
static constexpr auto size = sizeof(type)
class BufferDataType
#include <iBuffer.h>

A wrapper around nvinfer1::DataType that provides support for pointer types.

Public Functions

inline constexpr BufferDataType(nvinfer1::DataType dataType, bool _unsigned = false, bool pointer = false)
inline constexpr operator nvinfer1::DataType() const noexcept
inline constexpr nvinfer1::DataType getDataType() const noexcept
inline constexpr bool isPointer() const noexcept
inline constexpr bool isUnsigned() const
inline constexpr std::size_t getSize() const noexcept

Public Static Attributes

static constexpr auto kTrtPointerType = nvinfer1::DataType::kINT64

Private Members

nvinfer1::DataType mDataType
bool mUnsigned
bool mPointer
template<typename T, bool = false>
struct TRTDataType
#include <iBuffer.h>

For converting a C++ data type to a TensorRT data type.

template<>
struct TRTDataType<float>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kFLOAT
template<>
struct TRTDataType<half>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kHALF
template<>
struct TRTDataType<std::int8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT8
template<>
struct TRTDataType<std::int32_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT32
template<>
struct TRTDataType<std::uint32_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
template<>
struct TRTDataType<std::int64_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kINT64
template<>
struct TRTDataType<std::uint64_t>

Public Static Attributes

static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
template<>
struct TRTDataType<bool>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kBOOL
template<>
struct TRTDataType<std::uint8_t>

Public Static Attributes

static constexpr auto value = nvinfer1::DataType::kUINT8
template<>
struct TRTDataType<void*>

Public Static Attributes

static constexpr auto value = BufferDataType::kTrtPointerType
template<typename T>
struct TRTDataType<T*>

Public Static Attributes

static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}

Private Static Attributes

static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<T, false>::value}
class IBuffer

Subclassed by tensorrt_llm::runtime::ITensor

Public Types

using UniquePtr = std::unique_ptr<IBuffer>
using SharedPtr = std::shared_ptr<IBuffer>
using UniqueConstPtr = std::unique_ptr<IBuffer const>
using SharedConstPtr = std::shared_ptr<IBuffer const>
using DataType = nvinfer1::DataType

Public Functions

virtual void *data() = 0

Returns a pointer to underlying array.

virtual void const *data() const = 0

Returns a pointer to underlying array.

inline virtual void *data(std::size_t index)

Returns a pointer to the underlying array at a given element index.

inline virtual void const *data(std::size_t index) const

Returns a pointer to the underlying array at a given element index.

virtual std::size_t getSize() const = 0

Returns the size (in number of elements) of the buffer.

inline virtual std::size_t getSizeInBytes() const

Returns the size (in bytes) of the buffer.

virtual std::size_t getCapacity() const = 0

Returns the capacity of the buffer.

virtual DataType getDataType() const = 0

Returns the data type of the buffer.

virtual char const *getDataTypeName() const
virtual MemoryType getMemoryType() const = 0

Returns the memory type of the buffer.

virtual char const *getMemoryTypeName() const
virtual void resize(std::size_t newSize) = 0

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

virtual void release() = 0

Releases the buffer. It will be reset to nullptr.

virtual ~IBuffer() = default
IBuffer(IBuffer const&) = delete

Not allowed to copy.

IBuffer &operator=(IBuffer const&) = delete

Not allowed to copy.

Public Static Functions

static UniquePtr slice(SharedPtr buffer, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.

Parameters:
  • buffer – The buffer to view.

  • offset – The offset of the view.

  • size – The size of the view.

Returns:

A view on the buffer.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr buffer, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently resized.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static inline UniquePtr view(SharedPtr tensor, std::size_t size)

Returns a view on the underlying tensor with a different size.

Parameters:
  • tensor – The tensor to view.

  • size – The size of the view.

Returns:

A view on the tensor.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)

Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • size – The size of the buffer.

  • capacity – The capacity of the buffer.

Returns:

An IBuffer.

static inline UniquePtr wrap(void *data, DataType type, std::size_t size)
template<typename T>
static inline UniquePtr wrap(T *data, std::size_t size, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, std::size_t size)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v)
static MemoryType memoryType(void const *data)

Determine the memory type of a pointer.

Protected Functions

IBuffer() = default
inline std::size_t toBytes(std::size_t size) const

Returns an array index or size in bytes.

template<typename T>
class BufferRange

Public Types

using value_type = T
using size_type = std::size_t
using reference = value_type&
using const_reference = value_type const&
using pointer = T*
using const_pointer = T const*
using iterator = pointer
using const_iterator = const_pointer

Public Functions

inline explicit BufferRange(IBuffer &buffer)
inline iterator begin()
inline iterator end()
inline const_iterator begin() const
inline const_iterator end() const
inline const_iterator cbegin()
inline const_iterator cend()
inline const_iterator cbegin() const
inline const_iterator cend() const
inline size_type size() const
inline reference operator[](size_type index)
inline const_reference operator[](size_type index) const

Private Members

T *mData
size_type mSize
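
A sketch of typed access to an IBuffer via bufferCast and BufferRange, using pinned host memory so the pointer is directly accessible on the CPU (include paths are assumptions):

#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/iBuffer.h>

#include <cstdint>
#include <numeric>

using namespace tensorrt_llm::runtime;

void typedAccessExample()
{
    auto buffer = BufferManager::pinned(8, nvinfer1::DataType::kINT32);

    // Typed pointer into the underlying storage.
    auto* data = bufferCast<std::int32_t>(*buffer);
    data[0] = 42;

    // BufferRange models a contiguous range over the same storage.
    BufferRange<std::int32_t> range{*buffer};
    std::iota(range.begin(), range.end(), 0);
}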

iGptDecoderBatch.h

namespace tensorrt_llm
namespace runtime
class IGptDecoderBatch : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
#include <iGptDecoderBatch.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::GptDecoderBatch

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>
using TokenPtr = std::unique_ptr<decoder_batch::Token const>

Public Functions

virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) = 0

Initialize the decoder at batchIdx with a new request.

virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0

Run one step for all requests without blocking the host process and return the token for synchronization.

virtual void forwardSync(decoder_batch::Token const &token) = 0

Wait for the call to forwardAsync associated with a token to complete.

inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)

Run one step for all requests and wait for completion on the host.

virtual TensorPtr getOutputIds(SizeType batchIdx) const = 0
Parameters:

batchIdx – index of the batch

Returns:

[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu

virtual CudaEvent finalize(SizeType batchIdx) const = 0

Gather final beam search results for request batchIdx. The result will only be available after the returned event has been synchronized.

virtual std::vector<bool> getFinished() const = 0
Returns:

[batchSize (actual)], marks finished requests (per batch)

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getCumLogProbs(SizeType batchIdx) const = 0
Returns:

[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs(SizeType batchIdx) const = 0
Returns:

[beamWidth, maxSeqLen], log probabilities (per beam) for request batchIdx, on gpu

virtual TensorPtr getParentIds() const = 0
virtual std::vector<SizeType> getNbSteps() const = 0

Protected Functions

IGptDecoderBatch() = default
namespace decoder_batch

Typedefs

using Output = decoder::Output
class Request

Public Types

using ConstTensorPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr
using BufferPtr = IBuffer::SharedPtr

Public Functions

inline explicit Request(ConstTensorPtr ids, SizeType inputLen, std::optional<SizeType> maxNewTokens = std::nullopt, std::optional<SizeType> endId = std::nullopt)
inline SizeType generatedTokensPerStep() const

Public Members

ConstTensorPtr ids
SizeType inputLen
std::optional<SizeType> maxNewTokens
std::optional<SizeType> endId
BufferPtr draftTokens
TensorPtr embeddingBias
TensorPtr badWordsList
TensorPtr stopWordsList
bool computeCumLogProbs
bool computeLogProbs
class Input

Public Types

using TensorConstPtr = ITensor::SharedConstPtr
using TensorPtr = ITensor::SharedPtr

Public Functions

inline explicit Input(std::vector<TensorConstPtr> const &logits, std::vector<bool> const &active)
inline explicit Input(std::vector<TensorConstPtr> const &logits)
inline explicit Input(std::vector<TensorPtr> const &logits, std::vector<bool> const &active)
inline explicit Input(std::vector<TensorPtr> const &logits)

Public Members

std::vector<TensorConstPtr> logits
std::vector<bool> active
TensorConstPtr cacheIndirection
class Token

Public Functions

inline explicit Token(CudaEvent &&event, std::vector<bool> const &active)

Public Members

CudaEvent event
std::vector<bool> active

iStatefulGptDecoder.h

namespace tensorrt_llm
namespace runtime
class IStatefulGptDecoder
#include <iStatefulGptDecoder.h>

GPT decoder class with support for in-flight batching.

Subclassed by tensorrt_llm::runtime::IGptDecoderBatch

Public Types

using CudaStreamPtr = std::shared_ptr<CudaStream>
using TensorPtr = std::shared_ptr<ITensor>

Public Functions

virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxKvCacheLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, nvinfer1::DataType dtype) = 0

Set up the decoder before calling forward(); this also calls reshapeBuffers.

virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) = 0

Initialize the decoder with new batch of inputs.

virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0

Run one step for all requests without blocking the host thread.

virtual void forwardSync() = 0

Wait for the last call to forwardAsync to complete.

inline virtual void forward(decoder::Output &output, decoder::Input const &input)

Run one step for all requests.

virtual void finalize() const = 0

Gather final beam search results for all requests.

virtual TensorPtr getOutputIds() const = 0
Returns:

[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu

virtual TensorPtr getCumLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu

virtual TensorPtr getLogProbs() const = 0
Returns:

[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu

virtual TensorPtr getNewTokens(SizeType iter = 0) const = 0

Get tokens generated in one step of last forward pass.

Parameters:

iter – The iteration within [0; maxTokensPerStep) for which to get the tokens

Returns:

[batchSize, beamWidth], tokens generated in iter (per beam), on gpu

virtual TensorPtr getAllNewTokens() const = 0

Get maxTokensPerStep tokens generated in the last forward pass.

Returns:

[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu

virtual TensorPtr getNbFinished() const = 0
Returns:

[1], number of finished sequences, in pinned host memory

virtual ~IStatefulGptDecoder() = default

Protected Functions

IStatefulGptDecoder() = default
namespace decoder
class Input

Public Types

using TensorPtr = std::shared_ptr<ITensor const>

Public Functions

inline explicit Input(TensorPtr logits)

Public Members

TensorPtr logits
TensorPtr cacheIndirection
class Output

Public Types

using TensorPtr = std::shared_ptr<ITensor>

Public Functions

Output() = default

Public Members

TensorPtr cacheIndirection
TensorPtr sequenceLengths

iTensor.h

namespace nvinfer1
namespace tensorrt_llm
namespace runtime

Functions

inline std::ostream &operator<<(std::ostream &output, ITensor::Shape const &dims)

Utility function to print a shape.

std::ostream &operator<<(std::ostream &output, ITensor const &tensor)

Utility function to print a tensor with its shape.

class ITensor : public virtual tensorrt_llm::runtime::IBuffer

Public Types

using UniquePtr = std::unique_ptr<ITensor>
using SharedPtr = std::shared_ptr<ITensor>
using UniqueConstPtr = std::unique_ptr<ITensor const>
using SharedConstPtr = std::shared_ptr<ITensor const>
using Shape = nvinfer1::Dims
using DimType = std::remove_reference_t<decltype(Shape::d[0])>

Public Functions

~ITensor() override = default
virtual Shape const &getShape() const = 0

Returns the tensor dimensions.

virtual void reshape(Shape const &dims) = 0

Sets the tensor dimensions. The new size of the tensor will be volume(dims).

inline virtual void resize(std::size_t newSize) override

Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.

ITensor(ITensor const&) = delete

Not allowed to copy.

ITensor &operator=(ITensor const&) = delete

Not allowed to copy.

inline void squeeze(SizeType dim)

Removes the given unit dimensions from this tensor.

inline void unsqueeze(SizeType dim)

Adds a unit dimension at the specified position.

Public Static Functions

static inline std::int64_t volume(Shape const &dims)

Returns the volume of the dimensions. Returns -1 if dims.nbDims < 0.

static inline std::size_t volumeNonNegative(Shape const &shape)

Returns the volume of the dimensions. Throws if shape.nbDims < 0.

static Shape squeeze(Shape const &shape, SizeType dim)

Removes the given unit dimension from shape.

Parameters:
  • shape – The shape to squeeze.

  • dim – The dimension that should be removed (“squeezed”).

Returns:

A new shape without the unit dimension.

static Shape unsqueeze(Shape const &shape, SizeType dim)

Add a unit dimension to shape at the specified position.

Parameters:
  • shape – The shape to unsqueeze.

  • dim – The dimension where unit dimension should be added.

Returns:

A new shape with the added unit dimension.

static UniquePtr slice(SharedPtr tensor, std::size_t offset, std::size_t size)

Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.

Parameters:
  • tensor – The tensor to view.

  • offset – The offset of the view w.r.t. dimension 0 of the tensor.

  • size – The size of the view w.r.t. dimension 0 of the tensor.

Returns:

A view on the buffer.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
static inline UniquePtr slice(SharedPtr tensor, std::size_t offset)
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
static UniquePtr view(IBuffer::SharedPtr buffer, Shape const &dims)

Returns a view on the underlying buffer (or tensor) with the given shape.

Parameters:
  • buffer – The buffer to view.

  • dims – The shape of the view.

Returns:

A view on the tensor.

template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
static inline UniquePtr view(SharedPtr tensor)

Returns a view on the underlying tensor which can be independently reshaped.

Parameters:

tensor – The tensor to view.

Returns:

A view on the tensor.

static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)

Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.

Parameters:
  • data – The data to wrap.

  • type – The data type of the data.

  • shape – The shape of the tensor.

  • capacity – The capacity of the buffer.

Returns:

An ITensor.

static inline UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape)
template<typename T>
static inline UniquePtr wrap(std::vector<T> &v, Shape const &shape)
static Shape makeShape(std::initializer_list<SizeType> const &dims)

A convenience function to create a tensor shape with the given dimensions.

static std::string toString(Shape const &dims)

A convenience function for converting a tensor shape to a string.

Protected Functions

ITensor() = default

Protected Static Functions

static inline DimType castSize(size_t newSize)
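
A sketch of the ITensor shape helpers; the tensor lives on the CPU so no stream is involved (shapes are illustrative):

#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/iTensor.h>

using namespace tensorrt_llm::runtime;

void shapeExample()
{
    auto tensor = BufferManager::cpu(ITensor::makeShape({1, 4, 8}), nvinfer1::DataType::kFLOAT);

    auto const& shape = tensor->getShape();           // [1, 4, 8]
    auto const numElements = ITensor::volume(shape);  // 32

    tensor->squeeze(0);                           // shape becomes [4, 8]
    tensor->unsqueeze(2);                         // shape becomes [4, 8, 1]
    tensor->reshape(ITensor::makeShape({2, 16})); // same volume, different shape
}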

ipcUtils.h

namespace tensorrt_llm
namespace runtime

Functions

void setPeerAccess(WorldConfig worldConfig, bool enable = true)
class IpcMemory

Public Types

using TensorPtr = ITensor::SharedPtr

Public Functions

IpcMemory(WorldConfig worldConfig, std::size_t bufferSize)
~IpcMemory()
inline const std::vector<void*> &getCommPtrsTensor() const

Public Static Attributes

static constexpr size_t FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t)

Private Functions

void allocateIpcMemory()
void destroyIpcMemory()

Private Members

WorldConfig mWorldConfig
std::vector<void*> mCommPtrs
std::size_t mBufferSize
void *mBufferPtr

memoryCounters.h

namespace tensorrt_llm
namespace runtime
class MemoryCounters

Public Types

using SizeType = std::size_t
using DiffType = std::ptrdiff_t

Public Functions

MemoryCounters() = default
inline SizeType getGpu() const
inline SizeType getCpu() const
inline SizeType getPinned() const
inline DiffType getGpuDiff() const
inline DiffType getCpuDiff() const
inline DiffType getPinnedDiff() const
template<MemoryType T>
inline void allocate(SizeType size)
void allocate(MemoryType memoryType, SizeType size)
template<MemoryType T>
inline void deallocate(SizeType size)
void deallocate(MemoryType memoryType, SizeType size)
std::string toString() const

Public Static Functions

static inline MemoryCounters &getInstance()
static std::string bytesToString(SizeType bytes, int precision = 2)
static std::string bytesToString(DiffType bytes, int precision = 2)

Private Members

SizeType mGpu = {}
SizeType mCpu = {}
SizeType mPinned = {}
DiffType mGpuDiff = {}
DiffType mCpuDiff = {}
DiffType mPinnedDiff = {}

Private Static Attributes

static thread_local MemoryCounters mInstance
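
A sketch of reading the allocation counters maintained by the runtime (output formatting comes from toString and bytesToString):

#include <tensorrt_llm/runtime/memoryCounters.h>

#include <iostream>

using namespace tensorrt_llm::runtime;

void printMemoryUsage()
{
    auto& counters = MemoryCounters::getInstance();
    // Human-readable summary of GPU, CPU and pinned allocations made through the runtime.
    std::cout << counters.toString() << std::endl;
    std::cout << "GPU bytes in use: " << MemoryCounters::bytesToString(counters.getGpu()) << std::endl;
}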

promptTuningParams.h

namespace tensorrt_llm
namespace runtime
template<typename TTensor>
class GenericPromptTuningParams

Public Types

using TensorPtr = TTensor
using SizeType = tensorrt_llm::runtime::SizeType

Public Functions

inline explicit GenericPromptTuningParams(TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr())

Public Members

TensorPtr embeddingTable
TensorPtr tasks
TensorPtr vocabSize
std::vector<bool> promptTuningEnabled
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>

Public Types

using TensorPtr = ITensor::SharedPtr
using SizeType = GenericPromptTuningParams::SizeType

Public Functions

inline explicit PromptTuningParams(TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr)
void fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, const SizeType numContextRequests, const std::vector<SizeType> &reqBeamWidths, const std::vector<SizeType> &reqPromptLengths, BufferManager const &manager, bool packedInput)

samplingConfig.h

namespace tensorrt_llm
namespace runtime
class SamplingConfig

Public Functions

inline explicit SamplingConfig(SizeType beamWidth = 1)

Public Members

SizeType beamWidth
OptVec<FloatType> temperature
OptVec<SizeType> minLength
OptVec<FloatType> repetitionPenalty
OptVec<FloatType> presencePenalty
OptVec<SizeType> topK
OptVec<FloatType> topP
OptVec<unsigned long long> randomSeed
OptVec<FloatType> topPDecay
OptVec<FloatType> topPMin
OptVec<SizeType> topPResetIds
OptVec<FloatType> beamSearchDiversityRate
OptVec<FloatType> lengthPenalty

Private Types

using FloatType = float
template<typename T>
using OptVec = std::optional<std::vector<T>>
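
A sketch of configuring sampling; each field is an optional vector of values, and the numbers below are illustrative:

#include <tensorrt_llm/runtime/samplingConfig.h>

#include <vector>

using namespace tensorrt_llm::runtime;

SamplingConfig makeSamplingConfig()
{
    SamplingConfig config{/*beamWidth=*/1}; // beamWidth > 1 enables beam search
    config.temperature = std::vector<float>{0.7f};
    config.topK = std::vector<SizeType>{40};
    config.topP = std::vector<float>{0.9f};
    config.minLength = std::vector<SizeType>{1};
    config.randomSeed = std::vector<unsigned long long>{42ull};
    return config;
}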

tllmLogger.h

namespace tensorrt_llm
namespace runtime
class TllmLogger : public nvinfer1::ILogger

Public Functions

void log(Severity severity, nvinfer1::AsciiChar const *msg) noexcept override
Severity getLevel()
void setLevel(Severity level)

worldConfig.h

namespace tensorrt_llm
namespace runtime
class WorldConfig

Public Functions

inline explicit constexpr WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode)
inline constexpr SizeType getSize() const noexcept
inline constexpr SizeType getTensorParallelism() const noexcept
inline constexpr bool isTensorParallel() const noexcept
inline constexpr SizeType getPipelineParallelism() const noexcept
inline constexpr bool isPipelineParallel() const noexcept
inline constexpr SizeType getRank() const noexcept
inline constexpr SizeType getGpusPerNode() const noexcept
inline constexpr SizeType getDevice() const noexcept
inline constexpr SizeType getPipelineParallelRank() const noexcept
inline constexpr SizeType getTensorParallelRank() const noexcept
inline constexpr bool isFirstPipelineParallelRank() const noexcept
inline constexpr bool isLastPipelineParallelRank() const noexcept

Is my rank the last rank in its pipeline?

inline constexpr SizeType getLastRank() const noexcept
std::vector<SizeType> getPipelineParallelGroup() const

Public Static Functions

static bool validConfig(nvinfer1::ILogger &logger, SizeType tensorParallelism, SizeType pipelineParallelism)
static WorldConfig mpi(nvinfer1::ILogger &logger, SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt)
static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt)

Public Static Attributes

static constexpr SizeType kDefaultGpusPerNode = 8

Private Members

SizeType mTensorParallelism
SizeType mPipelineParallelism
SizeType mRank
SizeType mGpusPerNode
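
A sketch of setting up the world configuration (gpusPerNode and the use of MPI are assumptions; a default-constructed WorldConfig describes a single-GPU run):

#include <tensorrt_llm/runtime/worldConfig.h>

#include <optional>

using namespace tensorrt_llm::runtime;

void worldConfigExample()
{
    // Single process, single GPU: tensorParallelism = 1, pipelineParallelism = 1, rank = 0.
    WorldConfig single{};

    // Derive parallelism and rank from the MPI environment.
    auto const world = WorldConfig::mpi(/*gpusPerNode=*/8,
        /*tensorParallelism=*/std::nullopt, /*pipelineParallelism=*/std::nullopt);

    auto const device = world.getDevice();                     // CUDA device used by this rank
    bool const lastStage = world.isLastPipelineParallelRank(); // e.g. the rank that runs the decoder
}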