Runtime
bufferManager.h
-
namespace tensorrt_llm
-
namespace runtime
-
class BufferManager
- #include <bufferManager.h>
A helper class for managing memory on host and device.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
explicit BufferManager(CudaStreamPtr stream)
Construct a BufferManager.
- Parameters:
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
-
IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size on the GPU.
-
ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions on the GPU.
-
IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size in UVM.
-
ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions in UVM.
-
IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size and memory type.
-
ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions and memory type.
-
inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty IBuffer of the given memory type. It may be resized later.
-
inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty ITensor of the given memory type. It may be reshaped later.
-
void copy(void const *src, IBuffer &dst, MemoryType srcType) const
Copy src to dst.
-
void copy(IBuffer const &src, void *dst, MemoryType dstType) const
Copy src to dst.
-
IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
CudaStream const &getStream() const
Get the underlying cuda stream.
-
std::size_t memoryPoolReserved() const
The current size of the memory reserved by the memory pool.
-
std::size_t memoryPoolUsed() const
The current size of the memory used by the memory pool.
-
std::size_t memoryPoolFree() const
The current size of the memory free in the memory pool.
-
void memoryPoolTrimTo(std::size_t size)
Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
Public Static Functions
-
static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the CPU.
-
static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU.
-
static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
-
static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
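A minimal usage sketch of the allocation and copy helpers above, assuming only the signatures listed in this section; the include paths and concrete sizes are illustrative.

```cpp
#include <bufferManager.h>
#include <cudaStream.h>
#include <iTensor.h>

#include <memory>
#include <vector>

using namespace tensorrt_llm::runtime;

void bufferManagerSketch()
{
    // All GPU operations of the manager are issued on this stream.
    auto stream = std::make_shared<CudaStream>();
    BufferManager manager{stream};

    // A [2, 3] float tensor on the GPU and a 16-byte buffer in UVM.
    auto gpuTensor = manager.gpu(ITensor::makeShape({2, 3}), nvinfer1::DataType::kFLOAT);
    auto uvmBuffer = manager.managed(16);

    // Copy host data into a newly allocated GPU buffer.
    std::vector<float> hostData(6, 1.0F);
    auto deviceCopy = manager.copyFrom(hostData, MemoryType::kGPU);

    // Pinned host memory does not require a manager instance.
    auto pinned = BufferManager::pinned(1024);

    // Wait for the asynchronous allocations and copies before reading results.
    stream->synchronize();
}
```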
Private Members
-
CudaStreamPtr mStream
common.h
-
namespace tensorrt_llm
-
namespace runtime
cudaEvent.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaEvent
Public Types
-
using pointer = cudaEvent_t
Public Functions
-
inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)
Creates a new cuda event. The event will be destroyed in the destructor.
- Parameters:
flags – Flags for event creation. By default, event timing is disabled.
-
inline explicit CudaEvent(pointer event, bool ownsEvent = true)
Pass an existing cuda event to this object.
- Parameters:
event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
-
inline void synchronize() const
Synchronizes the event.
Private Types
-
using EventPtr = std::unique_ptr<element_type, Deleter>
cudaStream.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaStream
Public Functions
-
inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)
Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
- Parameters:
flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
-
inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)
Pass an existing cuda stream to this object.
- Parameters:
stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
-
inline int getDevice() const
Returns the device on which the stream was created.
-
inline cudaStream_t get() const
Returns the stream associated with this object.
-
inline void synchronize() const
Synchronizes the stream.
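A short sketch of the stream wrapper, assuming the constructors and accessors listed above; the include path is illustrative.

```cpp
#include <cudaStream.h>

using tensorrt_llm::runtime::CudaStream;

void cudaStreamSketch()
{
    // Non-blocking stream with default priority on the current device.
    CudaStream stream{cudaStreamNonBlocking, /*priority=*/0};

    // The raw handle can be passed to CUDA or TensorRT APIs directly.
    cudaStream_t raw = stream.get();
    static_cast<void>(raw);

    // Device index the stream was created on, and a blocking wait for completion.
    int const device = stream.getDevice();
    static_cast<void>(device);
    stream.synchronize();
}
```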
decodingInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingInput
-
Public Functions
Public Members
decodingOutput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingOutput
-
Public Members
-
BeamHypotheses beamHypotheses
Public Static Attributes
-
static constexpr float kNegativeInfinity = -1e20f
-
class BeamHypotheses
Public Functions
-
void empty(BufferManager &manager)
-
void release()
-
void init(BufferManager &manager, TokenIdType endId)
-
BeamHypotheses slice(SizeType batchIndex, SizeType size) const
generationInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput -
Public Functions
-
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
Public Types
-
using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
generationOutput.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor>
class GenericGenerationOutput Public Types
-
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>
Public Types
-
using Base = GenericGenerationOutput<ITensor::SharedPtr>
gptDecoder.h
-
namespace tensorrt_llm
-
namespace layers
-
namespace runtime
-
class IGptDecoder
Subclassed by tensorrt_llm::runtime::GptDecoder< T >
Public Functions
-
virtual ~IGptDecoder() = default
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength) = 0
-
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) = 0
-
virtual const SamplingConfig &getSamplingConfig() = 0
Public Static Functions
-
static void acceptDraftTokensByIds(const ITensor &targetTokenIds, const ITensor &draftTokenIds, const ITensor &contextLengths, const ITensor &numDraftTokens, ITensor &sequenceLengths, const ITensor &finishedVec, ITensor &finishedFinal, ITensor &finishedSum, BufferManager::CudaStreamPtr const &stream)
-
static void acceptDraftTokensByLogits(ITensor &draftLogits, const ITensor &targetLogits, ITensor &draftProbs, ITensor &targetProbs, const ITensor &numDraftTokens, ITensor &finished, SizeType vocabSize, SizeType vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, curandState_t *curandState, BufferManager::CudaStreamPtr const &stream)
-
static inline std::unique_ptr<IGptDecoder> create(nvinfer1::DataType dtype, size_t maxBatchSize, size_t vocabSize, size_t vocabSizePadded, BufferManager::CudaStreamPtr const &stream)
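A hedged sketch of creating a decoder through the factory above; the concrete batch and vocabulary sizes are illustrative, and construction of DecodingInput/DecodingOutput is elided because their members are not documented in this section.

```cpp
#include <gptDecoder.h>

using namespace tensorrt_llm::runtime;

void decoderSketch(BufferManager::CudaStreamPtr const& stream, SamplingConfig const& samplingConfig)
{
    // The factory picks the GptDecoder<T> specialization matching the logits dtype.
    auto decoder = IGptDecoder::create(
        nvinfer1::DataType::kFLOAT, /*maxBatchSize=*/8, /*vocabSize=*/32000, /*vocabSizePadded=*/32000, stream);

    // Prepare sampling state for a batch of 8 sequences of at most 1024 tokens.
    decoder->setup(samplingConfig, /*batchSize=*/8, /*maxSequenceLength=*/1024);

    // One decoding step per call; input/output construction is omitted here:
    // bool const finished = decoder->forward(output, input);
}
```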
-
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder Public Types
-
using CudaStreamPtr = BufferManager::CudaStreamPtr
Public Functions
-
GptDecoder(size_t maxBatchSize, size_t vocabSize, size_t vocabSizePadded, CudaStreamPtr const &stream)
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength) override
-
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
-
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) override
-
inline virtual const SamplingConfig &getSamplingConfig() override
Private Members
-
BufferManager mManager
-
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
-
SamplingConfig mSamplingConfig
gptDecoderBatch.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
- #include <gptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
-
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, nvinfer1::DataType dtype) override
Setup the decoder before calling forward().
-
virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) override
Initialize the decoder at batchIdx with a new request.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) override
Initialize the decoder with a new batch of inputs.
-
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::Token const &e) override
Wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() override
Wait for the last call to forwardAsync to complete.
-
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
-
inline virtual TensorPtr getOutputIds(SizeType batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
inline virtual TensorPtr getOutputIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu
-
virtual CudaEvent finalize(SizeType batchIdx) const
Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.
-
virtual void finalize() const override
Gather final beam search results for all requests.
-
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getCumLogProbs(SizeType batchIdx) const
- Returns:
[maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs() const override
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs(SizeType batchIdx) const
- Returns:
[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getAllNewTokens() const override
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
inline virtual TensorPtr getNewTokens(SizeType iter = 0) const override
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
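A sketch of the token-based in-flight decoding flow described above: forwardAsync returns a synchronization token that is later passed to forwardSync. Construction of decoder_batch::Input/Output is elided.

```cpp
#include <gptDecoderBatch.h>

using namespace tensorrt_llm::runtime;

void decodeOneStep(GptDecoderBatch& decoder, decoder_batch::Output& output, decoder_batch::Input const& input)
{
    // Launch one step for all active requests without blocking the host.
    auto token = decoder.forwardAsync(output, input);

    // ...overlap other host-side work here...

    // Block until the step associated with this token has completed.
    decoder.forwardSync(*token);

    // Per-request completion flags and the newly generated tokens (on GPU).
    auto const finished = decoder.getFinished();
    auto newTokens = decoder.getAllNewTokens();
    static_cast<void>(finished);
    static_cast<void>(newTokens);
}
```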
Private Types
-
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
-
using DecodingInputPtr = std::unique_ptr<DecodingInput>
-
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
Private Functions
Private Members
-
std::size_t const mVocabSize
-
std::size_t const mVocabSizePadded
-
CudaStreamPtr mStream
-
BufferManager mBufferManager
-
TokenPtr mForwardToken
-
std::vector<CudaStreamPtr> mStreams
-
std::vector<GptDecoderPtr> mDecoders
-
std::vector<DecodingInputPtr> mDecodingInputs
-
std::vector<DecodingOutputPtr> mDecodingOutputs
-
DecodingInputPtr mJointDecodingInput
-
DecodingOutputPtr mJointDecodingOutput
-
std::vector<bool> mAcceptByLogits
-
std::vector<bool> mFinished
gptJsonConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptJsonConfig
Public Functions
-
inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, GptModelConfig const &modelConfig)
-
inline GptModelConfig getModelConfig() const
-
inline std::string const &getName() const
-
inline std::string const &getVersion() const
-
inline std::string const &getPrecision() const
-
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
-
inline std::string engineFilename(WorldConfig const &worldConfig) const
Public Static Functions
-
static GptJsonConfig parse(std::string const &json)
-
static GptJsonConfig parse(std::istream &json)
-
static GptJsonConfig parse(std::filesystem::path const &path)
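A sketch of loading an engine configuration and deriving the per-rank engine file name, assuming the signatures above; the config.json file name and directory layout are assumptions.

```cpp
#include <gptJsonConfig.h>
#include <worldConfig.h>

#include <filesystem>

using namespace tensorrt_llm::runtime;

std::filesystem::path resolveEngine(std::filesystem::path const& engineDir)
{
    // "config.json" is an assumed file name inside the engine directory.
    auto const json = GptJsonConfig::parse(engineDir / "config.json");
    auto const modelConfig = json.getModelConfig();
    static_cast<void>(modelConfig);

    // Rank and world size are taken from the MPI environment here.
    auto const worldConfig = WorldConfig::mpi();
    return engineDir / json.engineFilename(worldConfig);
}
```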
gptModelConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptModelConfig
-
Public Functions
-
inline explicit GptModelConfig(SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
-
inline constexpr bool useGptAttentionPlugin() const noexcept
-
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
-
inline constexpr bool usePackedInput() const noexcept
-
inline constexpr void usePackedInput(bool inputPacked) noexcept
-
inline constexpr bool usePagedKvCache() const noexcept
-
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
-
inline constexpr common::QuantMode getQuantMode() const noexcept
-
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
-
inline constexpr bool supportsInflightBatching() const noexcept
-
inline constexpr bool usePromptTuning() const noexcept
-
inline constexpr void setMaxPromptEmbeddingTableSize(SizeType maxPromptEmbeddingTableSize) noexcept
-
inline constexpr bool computeContextLogits() const noexcept
-
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
-
inline constexpr bool computeGenerationLogits() const noexcept
-
inline constexpr void computeGenerationLogits(bool computeGenerationLogits) noexcept
-
inline ModelVariant getModelVariant() const
-
inline void setModelVariant(ModelVariant modelVariant)
-
inline constexpr bool useCustomAllReduce() const noexcept
-
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
-
inline constexpr void setUseContextFMHAForGeneration(bool useContextFMHAForGeneration) noexcept
-
inline constexpr bool getContextFMHAForGeneration() const noexcept
-
inline constexpr void setPagedContextFMHA(bool pagedContextFMHA) noexcept
-
inline constexpr bool getPagedContextFMHA() const noexcept
-
inline constexpr bool useLoraPlugin() const noexcept
-
inline constexpr void useLoraPlugin(bool useLoraPlugin) noexcept
-
inline std::vector<LoraModule> const &getLoraModules() const noexcept
-
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
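An illustrative construction of a model description using the setters above; the concrete sizes are made up, and the note on in-flight batching is an assumption tied to supportsInflightBatching().

```cpp
#include <gptModelConfig.h>

using namespace tensorrt_llm::runtime;

GptModelConfig makeModelConfig()
{
    // Arbitrary example sizes for a half-precision model.
    GptModelConfig config{/*vocabSize=*/32000, /*nbLayers=*/32, /*nbHeads=*/32,
        /*hiddenSize=*/4096, nvinfer1::DataType::kHALF};

    // Features commonly required for in-flight batching (cf. supportsInflightBatching()).
    config.useGptAttentionPlugin(true);
    config.usePackedInput(true);
    config.usePagedKvCache(true);
    return config;
}
```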
Private Members
-
bool mUseGptAttentionPlugin
-
bool mInputPacked
-
bool mPagedKvCache
-
common::QuantMode mQuantMode
-
bool mComputeContextLogits
-
bool mComputeGenerationLogits
-
ModelVariant mModelVariant
-
bool mUseCustomAllReduce
-
bool mUseContextFMHAForGeneration
-
bool mPagedContextFMHA
-
bool mUseLoraPlugin
-
std::vector<LoraModule> mLoraModules
gptSession.h
-
namespace tensorrt_llm
-
-
namespace runtime
-
class GptSession
-
Public Functions
-
GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
-
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
-
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
-
BufferManager const &getBufferManager() const
-
inline GptModelConfig const &getModelConfig() const
-
inline WorldConfig const &getWorldConfig() const
-
inline int getDevice() const noexcept
-
void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)
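A hedged end-to-end sketch using only the constructor and generate() listed above; populating GenerationInput/GenerationOutput is elided because their members are not documented in this section.

```cpp
#include <gptSession.h>

#include <string>

using namespace tensorrt_llm::runtime;

void runSession(GptSession::Config const& sessionConfig, GptModelConfig const& modelConfig,
    WorldConfig const& worldConfig, std::string const& engineFile, GenerationInput const& inputs,
    GenerationOutput& outputs, SamplingConfig const& samplingConfig)
{
    // Deserializes the engine file and sets up the per-rank runtime buffers.
    GptSession session{sessionConfig, modelConfig, worldConfig, engineFile};

    // Runs the context and generation phases; results are written into `outputs`.
    session.generate(outputs, inputs, samplingConfig);

    // Wait for work on the session's stream before reading the outputs.
    session.getBufferManager().getStream().synchronize();
}
```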
Private Types
-
using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
-
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
Private Functions
-
inline bool useCudaGraphs()
-
void generateBatched(std::vector<GenerationOutput> &microBatchesOutputs, std::vector<GenerationInput> const &microBatchesInputs, SamplingConfig const &samplingConfig, TokenGeneratedCallback const &onTokenGenerated)
-
void createContexts()
-
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches)
-
void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, KvCacheConfig const &config)
-
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
-
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType> const &generationBatchesOffsets, KvCacheManager const *kvCacheManager)
-
SizeType executeGenerationStep(SizeType step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType> const &microBatchOffsets, KvCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)
-
void decoderStepAsync(SizeType decoderStep, SizeType microBatchId)
Execute the decoder on the last PP rank; receive the decoder output on the other PP ranks.
-
bool shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)
Synchronize with the decoder and return the shouldStop flag.
-
void finalize(SizeType microBatchId)
Collect the final output ids and log probs on the last PP rank and send them to the first PP rank.
Receives are asynchronous on the host, so synchronization is required before access.
-
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType microBatchId) const
Populate outputIds and return a reference to the newTokens tensor.
-
TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)
Private Members
-
GptModelConfig const mModelConfig
-
WorldConfig const mWorldConfig
-
int mDevice = {-1}
-
std::shared_ptr<NcclCommunicator> mPipelineComm
-
std::shared_ptr<CudaStream> mCommStream
-
std::shared_ptr<TllmRuntime> mRuntime
-
std::shared_ptr<KvCacheManager> mKvCacheManager
-
MicroBatchConfig mMicroBatchConfig
-
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
-
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
-
bool mCudaGraphMode = {false}
-
std::vector<CudaGraphExecutor> mCudaGraphInstances
Friends
- friend class batch_manager::TrtGptModelV1
-
class Config
- #include <gptSession.h>
Configuration for session execution and buffer sizes.
generate may be called with batch size and beam width smaller than the configured parameters. maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.
Public Functions
Public Members
-
bool decoderPerRequest = {false}
-
bool cudaGraphMode = {false}
-
KvCacheConfig kvCacheConfig = {}
-
class CudaGraphExecutor
Public Functions
-
CudaGraphExecutor() = default
-
inline ~CudaGraphExecutor()
-
inline bool hasInstance()
-
void clear()
-
void launch(CudaStream const &stream)
Private Functions
-
void create(cudaGraph_t const &graph)
-
bool update(cudaGraph_t const &graph)
-
void uploadToStream(CudaStream const &stream)
Private Members
-
cudaGraphExec_t mInstance
-
CudaGraphExecutor() = default
-
class MicroBatchConfig
iBuffer.h
-
template<>
struct MemoryTypeString<MemoryType::kGPU> Public Static Attributes
-
static constexpr auto value = "GPU"
-
static constexpr auto value = "GPU"
-
template<>
struct MemoryTypeString<MemoryType::kCPU> Public Static Attributes
-
static constexpr auto value = "CPU"
-
static constexpr auto value = "CPU"
-
template<>
struct MemoryTypeString<MemoryType::kPINNED> Public Static Attributes
-
static constexpr auto value = "PINNED"
-
static constexpr auto value = "PINNED"
-
template<>
struct MemoryTypeString<MemoryType::kUVM> Public Static Attributes
-
static constexpr auto value = "UVM"
-
static constexpr auto value = "UVM"
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32> Public Types
-
using type = std::int32_t
-
using type = std::int32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64> Public Types
-
using type = std::int64_t
-
using type = std::int64_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true> Public Types
-
using type = std::uint32_t
-
using type = std::uint32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true> Public Types
-
using type = std::uint64_t
-
using type = std::uint64_t
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned> Public Types
-
using type = bool
-
using type = bool
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned> Public Types
-
using type = std::uint8_t
-
using type = std::uint8_t
-
template<>
struct TRTDataType<std::int8_t>
-
template<>
struct TRTDataType<std::int32_t>
-
template<>
struct TRTDataType<std::uint32_t> Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
template<>
struct TRTDataType<std::int64_t>
-
template<>
struct TRTDataType<std::uint64_t> Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<std::uint8_t>
-
namespace tensorrt_llm
-
namespace runtime
Typedefs
Enums
Functions
-
template<MemoryType T>
struct MemoryTypeString
- template<> struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static constexpr auto value = "GPU"
-
static constexpr auto value = "GPU"
- template<> struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static constexpr auto value = "CPU"
-
static constexpr auto value = "CPU"
- template<> struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static constexpr auto value = "PINNED"
-
static constexpr auto value = "PINNED"
- template<> struct MemoryTypeString<MemoryType::kUVM>
Public Static Attributes
-
static constexpr auto value = "UVM"
-
static constexpr auto value = "UVM"
-
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits - #include <iBuffer.h>
For converting a TensorRT data type to a C++ data type.
- template<> struct DataTypeTraits<nvinfer1::DataType::kFLOAT>
Public Types
-
using type = float
-
using type = float
- template<> struct DataTypeTraits<nvinfer1::DataType::kHALF>
Public Types
-
using type = half
Public Static Attributes
-
static constexpr char name[] = "half"
-
static constexpr auto size = sizeof(type)
-
using type = half
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT8>
Public Types
-
using type = std::int8_t
Public Static Attributes
-
static constexpr char name[] = "int8"
-
static constexpr auto size = sizeof(type)
-
using type = std::int8_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT32>
Public Types
-
using type = std::int32_t
Public Static Attributes
-
static constexpr char name[] = "int32"
-
static constexpr auto size = sizeof(type)
-
using type = std::int32_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT64>
Public Types
-
using type = std::int64_t
Public Static Attributes
-
static constexpr char name[] = "int64"
-
static constexpr auto size = sizeof(type)
-
using type = std::int64_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
Public Static Attributes
-
static constexpr char name[] = "uint32"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint32_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
Public Static Attributes
-
static constexpr char name[] = "uint64"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint64_t
- template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
Public Static Attributes
-
static constexpr char name[] = "bool"
-
static constexpr auto size = sizeof(type)
-
using type = bool
- template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
Public Static Attributes
-
static constexpr char name[] = "uint8"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint8_t
-
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>
-
class BufferDataType
- #include <iBuffer.h>
A wrapper around nvinfer1::DataType that provides support for pointer types.
-
template<typename T, bool = false>
struct TRTDataType - #include <iBuffer.h>
For converting a C++ data type to a TensorRT data type.
-
template<>
struct TRTDataType<float>
-
template<>
struct TRTDataType<half>
- template<> struct TRTDataType<std::int8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT8
-
static constexpr auto value = nvinfer1::DataType::kINT8
- template<> struct TRTDataType<std::int32_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT32
-
static constexpr auto value = nvinfer1::DataType::kINT32
- template<> struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
- template<> struct TRTDataType<std::int64_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT64
-
static constexpr auto value = nvinfer1::DataType::kINT64
- template<> struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<bool>
- template<> struct TRTDataType<std::uint8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
template<>
struct TRTDataType<void*> Public Static Attributes
-
static constexpr auto value = BufferDataType::kTrtPointerType
-
static constexpr auto value = BufferDataType::kTrtPointerType
-
template<typename T>
struct TRTDataType<T*> Public Static Attributes
-
static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
Private Static Attributes
-
static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<T, false>::value}
-
static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
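Compile-time checks illustrating the two mapping directions above: TRTDataType maps a C++ type to a TensorRT data type, and DataTypeTraits maps a TensorRT data type back to a C++ type. Only mappings documented in this section are asserted; the include path is illustrative.

```cpp
#include <iBuffer.h>

#include <cstdint>
#include <type_traits>

using namespace tensorrt_llm::runtime;

// C++ -> TensorRT
static_assert(TRTDataType<std::int8_t>::value == nvinfer1::DataType::kINT8);
static_assert(TRTDataType<std::int32_t>::value == nvinfer1::DataType::kINT32);

// TensorRT -> C++
static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kINT32>::type, std::int32_t>);
static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kINT64>::type, std::int64_t>);

// Pointer types map to BufferDataType values that carry the pointee's data type
// plus a pointer flag (see TRTDataType<T*> above).
```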
-
class IBuffer
Subclassed by tensorrt_llm::runtime::ITensor
Public Types
Public Functions
-
virtual void *data() = 0
Returns a pointer to underlying array.
-
virtual void const *data() const = 0
Returns a pointer to underlying array.
-
inline virtual void *data(std::size_t index)
Returns a pointer to the underlying array at a given element index.
-
inline virtual void const *data(std::size_t index) const
Returns a pointer to the underlying array at a given element index.
-
virtual std::size_t getSize() const = 0
Returns the size (in number of elements) of the buffer.
-
inline virtual std::size_t getSizeInBytes() const
Returns the size (in bytes) of the buffer.
-
virtual std::size_t getCapacity() const = 0
Returns the capacity of the buffer.
-
virtual char const *getDataTypeName() const
-
virtual MemoryType getMemoryType() const = 0
Returns the memory type of the buffer.
-
virtual char const *getMemoryTypeName() const
-
virtual void resize(std::size_t newSize) = 0
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
virtual void release() = 0
Releases the buffer. It will be reset to nullptr.
-
virtual ~IBuffer() = default
Public Static Functions
Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
- Parameters:
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
- Returns:
A view on the buffer.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying tensor which can be independently resized.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
Returns a view on the underlying tensor with a different size.
- Parameters:
tensor – The tensor to view.
size – The size of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
-
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
- Returns:
An IBuffer.
-
static MemoryType memoryType(void const *data)
Determine the memory type of a pointer.
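A sketch of wrapping caller-owned memory in an IBuffer without copying, using the static helpers above; the element type and sizes are illustrative.

```cpp
#include <iBuffer.h>

#include <array>

using namespace tensorrt_llm::runtime;

void wrapSketch()
{
    // Caller-owned host storage; the IBuffer does not take ownership.
    std::array<float, 8> storage{};

    auto buffer = IBuffer::wrap(storage.data(), nvinfer1::DataType::kFLOAT,
        /*size=*/storage.size(), /*capacity=*/storage.size());

    // Where the wrapped pointer lives (host memory here) and the size in bytes.
    auto const memType = IBuffer::memoryType(storage.data());
    auto const bytes = buffer->getSizeInBytes();
    static_cast<void>(memType);
    static_cast<void>(bytes);

    // Resizing is allowed only up to the wrapped capacity.
    buffer->resize(4);
}
```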
-
template<typename T>
class BufferRange Public Types
-
using size_type = std::size_t
-
using reference = value_type&
-
using const_reference = value_type const&
-
using const_iterator = const_pointer
Public Functions
-
inline const_iterator begin() const
-
inline const_iterator end() const
-
inline const_iterator cbegin()
-
inline const_iterator cend()
-
inline const_iterator cbegin() const
-
inline const_iterator cend() const
-
inline const_reference operator[](size_type index) const
iGptDecoderBatch.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IGptDecoderBatch : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
- #include <iGptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::GptDecoderBatch
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
using TokenPtr = std::unique_ptr<decoder_batch::Token const>
Public Functions
-
virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) = 0
Initialize the decoder at batchIdx with a new request.
-
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::Token const &token) = 0
Wait for the call to forwardAsync associated with a token to complete.
-
inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)
Run one step for all requests and wait for completion on the host.
-
virtual TensorPtr getOutputIds(SizeType batchIdx) const = 0
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
virtual CudaEvent finalize(SizeType batchIdx) const = 0
Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.
-
virtual std::vector<bool> getFinished() const = 0
- Returns:
[batchSize (actual)], marks finished requests (per batch)
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getCumLogProbs(SizeType batchIdx) const = 0
- Returns:
[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu
Protected Functions
-
IGptDecoderBatch() = default
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
namespace decoder_batch
-
-
class Request
Public Types
-
using ConstTensorPtr = ITensor::SharedConstPtr
Public Functions
-
inline explicit Request(ConstTensorPtr ids, SizeType inputLen, std::optional<SizeType> maxNewTokens = std::nullopt, std::optional<SizeType> endId = std::nullopt)
-
using ConstTensorPtr = ITensor::SharedConstPtr
-
class Input
-
Public Functions
-
inline explicit Input(std::vector<TensorConstPtr> const &logits, std::vector<bool> const &active)
-
inline explicit Input(std::vector<TensorConstPtr> const &logits)
Public Members
-
std::vector<TensorConstPtr> logits
-
std::vector<bool> active
-
TensorConstPtr cacheIndirection
-
inline explicit Input(std::vector<TensorConstPtr> const &logits, std::vector<bool> const &active)
-
class Token
iStatefulGptDecoder.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IStatefulGptDecoder
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatch
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, nvinfer1::DataType dtype) = 0
Setup the decoder before calling forward(); this also calls reshapeBuffers.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) = 0
Initialize the decoder with new batch of inputs.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() = 0
Wait for the last call to forwardAsync to complete.
-
inline virtual void forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
-
virtual void finalize() const = 0
Gather final beam search results for all requests.
-
virtual TensorPtr getOutputIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
virtual TensorPtr getNewTokens(SizeType iter = 0) const = 0
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
-
virtual TensorPtr getAllNewTokens() const = 0
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
virtual TensorPtr getNbFinished() const = 0
- Returns:
[1], number of finished sequences, in pinned host memory
-
virtual ~IStatefulGptDecoder() = default
Protected Functions
-
IStatefulGptDecoder() = default
iTensor.h
-
namespace nvinfer1
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
class ITensor : public virtual tensorrt_llm::runtime::IBuffer
Public Types
Public Functions
-
~ITensor() override = default
-
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be volume(dims).
-
inline virtual void resize(std::size_t newSize) override
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
Public Static Functions
-
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
-
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if d.nbDims < 0.
-
static Shape squeeze(Shape const &shape, SizeType dim)
Removes the given unit dimension from shape.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
-
static Shape unsqueeze(Shape const &shape, SizeType dim)
Add a unit dimension to shape at the specified position.
- Parameters:
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
- Returns:
A new shape with the added unit dimension.
Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the buffer.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying buffer (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
-
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
An ITensor.
-
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
-
static Shape makeShape(std::initializer_list<SizeType> const &dims)
A convenience function to create a tensor shape with the given dimensions.
-
static std::string toString(Shape const &dims)
A convenience function for converting a tensor shape to a string.
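A small example of the static shape utilities above; no tensor allocation is involved, and the include path is illustrative.

```cpp
#include <iTensor.h>

#include <cassert>

using namespace tensorrt_llm::runtime;

void shapeSketch()
{
    // Build a [4, 1, 8] shape and check its volume (number of elements).
    auto const shape = ITensor::makeShape({4, 1, 8});
    assert(ITensor::volume(shape) == 32);

    // Remove the unit dimension at index 1, then add one back at the front.
    auto const squeezed = ITensor::squeeze(shape, 1);        // [4, 8]
    auto const unsqueezed = ITensor::unsqueeze(squeezed, 0); // [1, 4, 8]

    // Human-readable form, e.g. for logging.
    auto const text = ITensor::toString(unsqueezed);
    static_cast<void>(text);
}
```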
Protected Functions
-
ITensor() = default
ipcUtils.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
void setPeerAccess(WorldConfig const &worldConfig, bool enable = true)
-
class IpcMemory
-
Public Functions
-
IpcMemory(WorldConfig const &worldConfig, std::size_t bufferSize)
-
~IpcMemory()
-
inline const std::vector<void*> &getCommPtrsTensor() const
Public Static Attributes
-
static constexpr size_t FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t)
Private Members
-
WorldConfig mWorldConfig
-
std::vector<void*> mCommPtrs
-
std::size_t mBufferSize
-
void *mBufferPtr = {nullptr}
memoryCounters.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MemoryCounters
-
Public Functions
-
MemoryCounters() = default
-
template<MemoryType T>
inline void allocate(SizeType size)
-
void allocate(MemoryType memoryType, SizeType size)
-
template<MemoryType T>
inline void deallocate(SizeType size)
-
void deallocate(MemoryType memoryType, SizeType size)
-
std::string toString() const
Public Static Functions
-
static inline MemoryCounters &getInstance()
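An illustrative use of the singleton counters above; the sizes are arbitrary, and iBuffer.h is assumed to provide the MemoryType enumeration.

```cpp
#include <iBuffer.h>         // for MemoryType
#include <memoryCounters.h>

#include <iostream>

using namespace tensorrt_llm::runtime;

void countersSketch()
{
    auto& counters = MemoryCounters::getInstance();

    // Record a 1 MiB GPU allocation and its later release.
    counters.allocate<MemoryType::kGPU>(1 << 20);
    counters.deallocate(MemoryType::kGPU, 1 << 20);

    // Summary of the current counters.
    std::cout << counters.toString() << std::endl;
}
```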
promptTuningParams.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor>
class GenericPromptTuningParams -
Public Functions
-
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>
Public Types
-
using SizeType = GenericPromptTuningParams::SizeType
samplingConfig.h
tllmLogger.h
worldConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class WorldConfig
Public Functions
-
explicit WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType>> const &deviceIds = std::nullopt)
-
inline constexpr bool isTensorParallel() const noexcept
-
inline constexpr bool isPipelineParallel() const noexcept
-
inline constexpr bool isFirstPipelineParallelRank() const noexcept
-
inline constexpr bool isLastPipelineParallelRank() const noexcept
Is my rank the last rank in its pipeline?
Public Static Functions
-
static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt, std::optional<std::vector<SizeType>> const &deviceIds = std::nullopt)
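A sketch describing a 2-way tensor-parallel, 2-way pipeline-parallel world from rank 0's point of view, using the constructor defaults above; the mpi() factory instead derives the rank (and, by default, the parallelism) from the MPI environment.

```cpp
#include <worldConfig.h>

using namespace tensorrt_llm::runtime;

void worldConfigSketch()
{
    // Explicit construction: tp=2, pp=2, this process is rank 0, 8 GPUs per node.
    WorldConfig const config{/*tensorParallelism=*/2, /*pipelineParallelism=*/2, /*rank=*/0,
        /*gpusPerNode=*/8};

    bool const tensorParallel = config.isTensorParallel();         // true
    bool const firstPpRank = config.isFirstPipelineParallelRank(); // true for rank 0
    static_cast<void>(tensorParallel);
    static_cast<void>(firstPpRank);

    // Alternatively, let MPI determine the rank and world size:
    // auto const mpiConfig = WorldConfig::mpi();
}
```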