Runtime
bufferManager.h
-
namespace tensorrt_llm
-
namespace runtime
-
class BufferManager
- #include <bufferManager.h>
A helper class for managing memory on host and device.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
explicit BufferManager(CudaStreamPtr stream)
Construct a BufferManager.
- Parameters:
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
-
IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size on the GPU.
-
ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions on the GPU.
-
IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size and memory type.
-
ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions and memory type.
-
inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty IBuffer of the given memory type. It may be resized later.
-
inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty ITensor of the given memory type. It may be reshaped later.
-
void copy(void const *src, IBuffer &dst, MemoryType srcType) const
Copy src to dst.
-
void copy(IBuffer const &src, void *dst, MemoryType dstType) const
Copy src to dst.
-
IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
CudaStream const &getStream() const
Get the underlying cuda stream.
Public Static Functions
-
static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the CPU.
-
static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU.
-
static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU.
Private Members
-
CudaStreamPtr mStream
Private Static Functions
-
static void initMemoryPool(int device)
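A minimal usage sketch of BufferManager, assuming the tensorrt_llm/runtime headers shown above; shapes, sizes, and token values are placeholders:
```cpp
#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/cudaStream.h>

#include <cstdint>
#include <memory>
#include <vector>

using namespace tensorrt_llm::runtime;

int main()
{
    // All GPU work (allocation, copies) is issued on this stream.
    auto stream = std::make_shared<CudaStream>();
    BufferManager manager{stream};

    // Stage token ids on the host and copy them into a fresh GPU buffer.
    std::vector<std::int32_t> tokens{1, 2, 3, 4};
    auto deviceTokens = manager.copyFrom(tokens, MemoryType::kGPU);

    // Allocate an uninitialized [2, 4] float tensor on the GPU.
    auto logits = manager.gpu(nvinfer1::Dims2{2, 4}, nvinfer1::DataType::kFLOAT);

    // Pinned host memory for fast asynchronous device-to-host copies.
    auto hostStaging = BufferManager::pinned(tokens.size(), nvinfer1::DataType::kINT32);
    manager.copy(*deviceTokens, hostStaging->data(), MemoryType::kPINNED);

    // Copies are asynchronous; block before reading the results on the host.
    manager.getStream().synchronize();
    return 0;
}
```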
common.h
-
namespace tensorrt_llm
-
namespace runtime
cudaEvent.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaEvent
Public Types
-
using pointer = cudaEvent_t
Public Functions
-
inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)
Creates a new cuda event. The event will be destroyed in the destructor.
- Parameters:
flags – Flags for event creation. By default, event timing is disabled.
-
inline explicit CudaEvent(pointer event, bool ownsEvent = true)
Pass an existing cuda event to this object.
- Parameters:
event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
-
inline void synchronize() const
Synchronizes the event.
Private Types
-
using EventPtr = std::unique_ptr<element_type, Deleter>
cudaStream.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaStream
Public Functions
-
inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)
Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
- Parameters:
flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
-
inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)
Pass an existing cuda stream to this object.
- Parameters:
stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
-
inline int getDevice() const
Returns the device on which the stream was created.
-
inline cudaStream_t get() const
Returns the stream associated with this object.
-
inline void synchronize() const
Synchronizes the stream.
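A sketch combining CudaStream and CudaEvent; it assumes CudaEvent exposes a get() accessor mirroring CudaStream::get(), since only the constructors and synchronize() are excerpted above:
```cpp
#include <tensorrt_llm/runtime/cudaEvent.h>
#include <tensorrt_llm/runtime/cudaStream.h>

#include <cuda_runtime_api.h>

using namespace tensorrt_llm::runtime;

int main()
{
    // Non-blocking stream with default priority on the current device.
    CudaStream stream{cudaStreamNonBlocking, /*priority=*/0};

    // Wrap an externally created stream without taking ownership of it.
    cudaStream_t external{};
    cudaStreamCreate(&external);
    CudaStream wrapped{external, stream.getDevice(), /*ownsStream=*/false};

    // Event with timing disabled (the default), recorded on the stream.
    CudaEvent event{};
    cudaEventRecord(event.get(), stream.get());
    event.synchronize(); // block the host until the event fires

    cudaStreamDestroy(external); // we kept ownership of the wrapped stream
    return 0;
}
```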
decodingInput.h
decodingOutput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingOutput
-
Public Members
-
BeamHypotheses beamHypotheses
Public Static Attributes
-
static constexpr float kNegativeInfinity = -1e20f
-
class BeamHypotheses
Public Functions
-
void empty(BufferManager &manager)
-
void release()
-
void init(BufferManager &manager, TokenIdType endId)
-
BeamHypotheses slice(SizeType batchIndex, SizeType size) const
generationInput.h
generationOutput.h
gptDecoder.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder
Public Types
-
using CudaStreamPtr = BufferManager::CudaStreamPtr
Public Functions
-
GptDecoder(size_t vocabSize, size_t vocabSizePadded, CudaStreamPtr const &stream)
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize) override
-
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
Private Members
-
BufferManager mManager
-
common::CudaAllocator mAllocator
-
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
-
class IGptDecoder
Subclassed by tensorrt_llm::runtime::GptDecoder< T >
Public Functions
-
virtual ~IGptDecoder() = default
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize) = 0
-
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
Public Static Functions
-
static void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager)
-
static inline std::unique_ptr<IGptDecoder> create(nvinfer1::DataType dtype, size_t vocabSize, size_t vocabSizePadded, BufferManager::CudaStreamPtr const &stream)
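A sketch of building a decoder through the IGptDecoder factory; the vocabulary sizes and batch size are placeholders, and the per-step DecodingInput/DecodingOutput setup is elided since those headers are not excerpted here:
```cpp
#include <tensorrt_llm/runtime/gptDecoder.h>

#include <memory>

using namespace tensorrt_llm::runtime;

void buildDecoder()
{
    auto stream = std::make_shared<CudaStream>();

    // The factory instantiates GptDecoder<T> matching the logits data type.
    auto decoder = IGptDecoder::create(
        nvinfer1::DataType::kFLOAT, /*vocabSize=*/32000, /*vocabSizePadded=*/32064, stream);

    SamplingConfig samplingConfig; // members elided; see samplingConfig.h
    decoder->setup(samplingConfig, /*batchSize=*/8);

    // Each decoding step then consumes a DecodingInput and fills a DecodingOutput:
    // bool allFinished = decoder->forward(output, input);
}
```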
gptDecoderBatch.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
- #include <gptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
-
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) override
Set up the decoder before calling forward().
-
virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) override
Initialize the decoder at batchIdx with a new request.
-
virtual void newBatch(GenerationInput const &inputs, SamplingConfig const &samplingConfig) override
Initialize the decoder with a new batch of inputs.
-
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::Token const &e) override
Wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
-
virtual bool isFinishedSync() override
Wait for the last call to forwardAsync to complete and return whether all sequences have finished.
-
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
-
inline virtual TensorPtr getOutputIds(SizeType batchIdx) const override
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
inline virtual TensorPtr getOutputIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu
-
virtual std::tuple<CudaEvent, TensorPtr> getFinalOutputIds(SizeType batchIdx) const override
Execute postProcessRequest and return the output ids for request batchIdx. The result is only available after the returned event has completed.
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
virtual TensorPtr getFinalOutputIds() const override
Execute postProcessRequest and return the output ids.
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu
-
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-
inline virtual TensorPtr getFinishedBeams() const override
- Returns:
[batchSize, maxBeamWidth], marks finished requests (per beam), on gpu
-
inline virtual TensorPtr getOutputLengths() const override
- Returns:
[batchSize, maxBeamWidth], total sequence lengths (per beam), on gpu
-
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getNewTokens() const override
- Returns:
[batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
Private Types
-
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
-
using DecodingInputPtr = std::unique_ptr<DecodingInput>
-
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
Private Functions
Private Members
-
const std::size_t mVocabSize
-
const std::size_t mVocabSizePadded
-
CudaStreamPtr mStream
-
BufferManager mBufferManager
-
TokenPtr mForwardToken
-
std::vector<CudaStreamPtr> mStreams
-
std::vector<GptDecoderPtr> mDecoders
-
std::vector<DecodingInputPtr> mDecodingInputs
-
std::vector<DecodingOutputPtr> mDecodingOutputs
-
DecodingInputPtr mJointDecodingInput
-
DecodingOutputPtr mJointDecodingOutput
-
std::vector<bool> mFinished
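A sketch of the in-flight batching flow, assuming decoder_batch::Request and the input/output structures are assembled by the surrounding batch manager (their fields are not excerpted above):
```cpp
#include <tensorrt_llm/runtime/gptDecoderBatch.h>

#include <cstddef>

using namespace tensorrt_llm::runtime;

void decodeStep(GptDecoderBatch& decoder,
    decoder_batch::Output& output, decoder_batch::Input const& input)
{
    // One asynchronous step for all active requests; the returned token
    // is used later to synchronize with exactly this step.
    auto token = decoder.forwardAsync(output, input);

    // ... overlap host-side work here ...

    decoder.forwardSync(*token);

    // Per-request completion flags, indexed by batch slot.
    auto const finished = decoder.getFinished();
    for (std::size_t batchIdx = 0; batchIdx < finished.size(); ++batchIdx)
    {
        if (finished[batchIdx])
        {
            // Gather the final ids; valid only after the returned event.
            auto [event, outputIds] = decoder.getFinalOutputIds(batchIdx);
            event.synchronize();
        }
    }
}
```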
gptJsonConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptJsonConfig
Public Functions
-
inline GptJsonConfig(std::string name, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, GptModelConfig const &modelConfig)
-
inline GptModelConfig getModelConfig() const
-
inline std::string const &getName() const
-
inline std::string const &getPrecision() const
-
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
-
inline std::string engineFilename(WorldConfig const &worldConfig) const
Public Static Functions
-
static GptJsonConfig parse(std::string const &json)
-
static GptJsonConfig parse(std::istream &json)
-
static GptJsonConfig parse(std::filesystem::path const &path)
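For example, loading the metadata written at engine build time and deriving the per-rank engine file name (the directory layout here is a placeholder):
```cpp
#include <tensorrt_llm/runtime/gptJsonConfig.h>
#include <tensorrt_llm/runtime/worldConfig.h>

#include <filesystem>

using namespace tensorrt_llm::runtime;

int main()
{
    auto const json = GptJsonConfig::parse(std::filesystem::path{"engines/gpt/config.json"});
    auto const modelConfig = json.getModelConfig();

    // Resolve this process' rank and parallelism from the MPI environment.
    auto const worldConfig = WorldConfig::mpi();

    // engineFilename() combines getName(), getPrecision(), and the world configuration.
    auto const enginePath = std::filesystem::path{"engines/gpt"} / json.engineFilename(worldConfig);
    return 0;
}
```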
gptModelConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptModelConfig
-
Public Functions
-
inline explicit constexpr GptModelConfig(SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
-
inline constexpr bool useGptAttentionPlugin() const noexcept
-
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
-
inline constexpr bool usePackedInput() const noexcept
-
inline constexpr void usePackedInput(bool inputPacked) noexcept
-
inline constexpr bool usePagedKvCache() const noexcept
-
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
-
inline constexpr common::QuantMode getQuantMode() const noexcept
-
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
-
inline constexpr bool supportsInflightBatching() const noexcept
-
inline constexpr bool computeContextLogits() const noexcept
-
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
-
inline ModelVariant getModelVariant() const
-
inline void setModelVariant(ModelVariant modelVariant)
-
inline constexpr bool useCustomAllReduce() const noexcept
-
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
gptSession.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptSession
-
Public Functions
-
GptSession(GptModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
-
inline GptSession(GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
-
inline GptSession(GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
-
BufferManager &getBufferManager() const
-
inline GptModelConfig const &getModelConfig() const
-
inline WorldConfig const &getWorldConfig() const
-
inline int getDevice() const noexcept
-
inline bool isCudaGraphMode() const noexcept
-
inline void setCudaGraphMode(bool value)
-
void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, bool decoderPerRequest, std::optional<SizeType> maxTokensInPagedKvCache = std::nullopt, std::optional<SizeType> numMicroBatches = std::nullopt)
Initialize buffers for the given sizes.
generate may be called with a batch size and beam width smaller than the setup parameters. maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.
-
inline void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)
Private Types
-
using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
Private Functions
-
void generateSingleBatch(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)
-
void generateMultiBatch(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)
-
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches)
-
void createKvCacheManagers(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, SizeType numMicroBatches, std::optional<SizeType> maxTokensInPagedKvCache)
-
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
Execute decoder on last PP rank, receive decoder output on other PP ranks.
-
bool shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)
Synchronize with the decoder and return the shouldStop flag.
-
void finalizeOutputIds(ITensor &outputIds, SizeType microBatchId)
Collect final output ids on last PP rank and send them to first PP rank.
Receives are asynchronous on host, so synchronization is required before access.
-
ITensor::SharedPtr initNewTokens(GenerationInput const &inputs, SamplingConfig const &samplingConfig, SizeType microBatchId)
Private Members
-
const GptModelConfig mModelConfig
-
const WorldConfig mWorldConfig
-
int mDevice = {-1}
-
std::shared_ptr<NcclCommunicator> mPipelineComm
-
std::shared_ptr<CudaStream> mCommStream
-
std::shared_ptr<TllmRuntime> mRuntime
-
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
-
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
-
std::vector<std::shared_ptr<KvCacheManager>> mKvCacheManagers
-
bool mCudaGraphMode = {false}
-
std::array<CudaGraphExecutor, 2> mCudaGraphInstances
-
class CudaGraphExecutor
Public Functions
-
CudaGraphExecutor() = default
-
inline ~CudaGraphExecutor()
-
inline bool hasInstance()
-
void clear()
-
void launch(CudaStream const &stream)
Private Types
-
using cudaGraphExecPtr = cudaGraphExec_t
Private Functions
-
void create(cudaGraph_t const &graph)
-
bool update(cudaGraph_t const &graph)
-
void uploadToStream(CudaStream const &stream)
Private Members
-
cudaGraphExecPtr mInstance
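A condensed end-to-end sketch. GenerationInput and GenerationOutput are listed above (generationInput.h / generationOutput.h) but not excerpted, so their setup is elided; paths and sizes are placeholders:
```cpp
#include <tensorrt_llm/runtime/gptSession.h>

using namespace tensorrt_llm::runtime;

void runSession(GptModelConfig const& modelConfig, WorldConfig const& worldConfig,
    GenerationInput const& inputs, GenerationOutput& outputs,
    SamplingConfig const& samplingConfig)
{
    // Deserialize the engine for this rank from disk.
    GptSession session{modelConfig, worldConfig, "engines/gpt/rank0.engine"};

    // Size all internal buffers once; generate() may use smaller batches.
    session.setup(/*maxBatchSize=*/8, /*maxBeamWidth=*/4, /*maxSequenceLength=*/2048,
        /*decoderPerRequest=*/false);

    session.generate(outputs, inputs, samplingConfig);

    // Outputs are produced on the session's stream.
    session.getBufferManager().getStream().synchronize();
}
```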
iBuffer.h
-
template<>
struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static constexpr auto value = "GPU"
-
template<>
struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static constexpr auto value = "CPU"
-
template<>
struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static constexpr auto value = "PINNED"
-
template<>
struct CppDataType<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
-
template<>
struct CppDataType<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
-
template<bool kUnsigned>
struct CppDataType<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
-
template<bool kUnsigned>
struct CppDataType<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
-
template<>
struct TRTDataType<std::int8_t>
-
template<>
struct TRTDataType<std::int32_t>
-
template<>
struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
template<>
struct TRTDataType<std::int64_t>
-
template<>
struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<std::uint8_t>
-
namespace tensorrt_llm
-
namespace runtime
Typedefs
Enums
Functions
-
class BufferDataType
- #include <iBuffer.h>
A wrapper around nvinfer1::DataType that provides support for pointer types.
-
template<typename T>
class BufferRange
Public Types
-
using size_type = std::size_t
-
using reference = value_type&
-
using const_reference = value_type const&
-
using const_iterator = const_pointer
Public Functions
-
inline const_iterator begin() const
-
inline const_iterator end() const
-
inline const_iterator cbegin()
-
inline const_iterator cend()
-
inline const_iterator cbegin() const
-
inline const_iterator cend() const
-
inline const_reference operator[](size_type index) const
-
using size_type = std::size_t
-
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct CppDataType
- #include <iBuffer.h>
For converting a TensorRT data type to a C++ data type.
-
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct CppDataType<kDataType, kUnsigned, true>
- template<bool kUnsigned> struct CppDataType<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
- template<> struct CppDataType<nvinfer1::DataType::kFLOAT>
Public Types
-
using type = float
- template<> struct CppDataType<nvinfer1::DataType::kHALF>
Public Types
-
using type = half
- template<> struct CppDataType<nvinfer1::DataType::kINT32>
Public Types
-
using type = std::int32_t
- template<> struct CppDataType<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
- template<> struct CppDataType<nvinfer1::DataType::kINT64>
Public Types
-
using type = std::int64_t
- template<> struct CppDataType<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
- template<> struct CppDataType<nvinfer1::DataType::kINT8>
Public Types
-
using type = std::int8_t
- template<bool kUnsigned> struct CppDataType<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
-
class IBuffer
Subclassed by tensorrt_llm::runtime::ITensor
Public Types
Public Functions
-
virtual void *data() = 0
Returns a pointer to underlying array.
-
virtual void const *data() const = 0
Returns a pointer to underlying array.
-
inline virtual void *data(std::size_t index)
Returns a pointer to the underlying array at a given element index.
-
inline virtual void const *data(std::size_t index) const
Returns a pointer to the underlying array at a given element index.
-
virtual std::size_t getSize() const = 0
Returns the size (in number of elements) of the buffer.
-
inline virtual std::size_t getSizeInBytes() const
Returns the size (in bytes) of the buffer.
-
virtual std::size_t getCapacity() const = 0
Returns the capacity of the buffer.
-
virtual MemoryType getMemoryType() const = 0
Returns the memory type of the buffer.
-
virtual void resize(std::size_t newSize) = 0
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
virtual void release() = 0
Releases the buffer. It will be reset to nullptr.
-
virtual ~IBuffer() = default
Public Static Functions
Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
- Parameters:
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
- Returns:
A view on the buffer.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying tensor which can be independently resized.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
Returns a view on the underlying tensor with a different size.
- Parameters:
tensor – The tensor to view.
size – The size of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
-
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
- Returns:
An IBuffer.
-
static MemoryType memoryType(void const *data)
Determine the memory type of a pointer.
-
template<MemoryType T>
struct MemoryTypeString
- template<> struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static constexpr auto value = "CPU"
- template<> struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static constexpr auto value = "GPU"
- template<> struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static constexpr auto value = "PINNED"
-
template<typename T, bool = false>
struct TRTDataType
- #include <iBuffer.h>
For converting a C++ data type to a TensorRT data type.
-
template<>
struct TRTDataType<bool>
-
template<>
struct TRTDataType<float>
-
template<>
struct TRTDataType<half>
- template<> struct TRTDataType<std::int32_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT32
- template<> struct TRTDataType<std::int64_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT64
- template<> struct TRTDataType<std::int8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT8
- template<> struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
- template<> struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
- template<> struct TRTDataType<std::uint8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
template<typename T>
struct TRTDataType<T*>
Public Static Attributes
-
static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
Private Static Attributes
-
static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<T, false>::value}
-
template<>
struct TRTDataType<void*>
Public Static Attributes
-
static constexpr auto value = BufferDataType::kTrtPointerType
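The type-mapping traits can be exercised at compile time, and IBuffer::memoryType classifies raw pointers at run time. A small sketch using only the specializations documented above:
```cpp
#include <tensorrt_llm/runtime/iBuffer.h>

#include <cstdint>
#include <type_traits>
#include <vector>

using namespace tensorrt_llm::runtime;

// C++ type -> TensorRT data type.
static_assert(TRTDataType<std::int32_t>::value == nvinfer1::DataType::kINT32);
static_assert(TRTDataType<std::uint8_t>::value == nvinfer1::DataType::kUINT8);

// TensorRT data type -> C++ type; 'true' selects the unsigned variant.
static_assert(std::is_same_v<CppDataType<nvinfer1::DataType::kINT32, true>::type, std::uint32_t>);

int main()
{
    // Classify a raw pointer at run time; plain heap memory reports kCPU.
    std::vector<float> host(16);
    return IBuffer::memoryType(host.data()) == MemoryType::kCPU ? 0 : 1;
}
```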
iGptDecoderBatch.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IGptDecoderBatch : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
- #include <iGptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::GptDecoderBatch
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
using TokenPtr = std::unique_ptr<decoder_batch::Token const>
Public Functions
-
virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) = 0
Initialize the decoder at batchIdx with a new request.
-
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::Token const &token) = 0
Wait for the call to forwardAsync associated with a token to complete.
-
inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)
Run one step for all requests and wait for completion on the host.
-
virtual TensorPtr getOutputIds(SizeType batchIdx) const = 0
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request
batchIdx, on gpu
-
virtual std::tuple<CudaEvent, TensorPtr> getFinalOutputIds(SizeType batchIdx) const = 0
Execute postProcessRequest and return the output ids for request batchIdx. The result is only available after the returned event has completed.
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
virtual TensorPtr getFinishedBeams() const = 0
- Returns:
[batchSize, beamWidth], marks finished requests (per beam), on gpu
-
virtual TensorPtr getOutputLengths() const = 0
- Returns:
[batchSize, beamWidth], total sequence lengths (per beam), on gpu
-
virtual std::vector<bool> getFinished() const = 0
- Returns:
[batchSize (actual)], marks finished requests (per batch)
Protected Functions
-
IGptDecoderBatch() = default
-
namespace decoder_batch
-
class Input : public tensorrt_llm::runtime::decoder::Input
-
Public Functions
-
inline explicit Input(TensorPtr logits)
-
inline explicit Input(TensorPtr logits, std::vector<bool> const &active)
Public Members
-
std::vector<bool> active
-
class Request
-
Public Functions
-
class Token
iStatefulGptDecoder.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IStatefulGptDecoder
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatch
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) = 0
Set up the decoder before calling forward(); this also calls reshapeBuffers.
-
virtual void newBatch(GenerationInput const &inputs, SamplingConfig const &samplingConfig) = 0
Initialize the decoder with a new batch of inputs.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
-
virtual bool isFinishedSync() = 0
Wait for the last call to forwardAsync to complete and return whether all sequences have finished.
-
inline virtual bool forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
-
virtual TensorPtr getOutputIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
Protected Functions
-
IStatefulGptDecoder() = default
-
namespace decoder
-
class Input
Subclassed by tensorrt_llm::runtime::decoder_batch::Input
iTensor.h
-
namespace nvinfer1
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
class ITensor : public virtual tensorrt_llm::runtime::IBuffer
Public Types
Public Functions
-
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be volume(dims).
-
~ITensor() override = default
Public Static Functions
-
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
-
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if d.nbDims < 0.
-
static inline Shape squeeze(Shape const &shape, SizeType dim)
Removes the given unit dimension from shape.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying buffer (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
-
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
An ITensor.
-
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
Protected Functions
-
ITensor() = default
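For instance, wrapping existing host memory in a non-owning tensor and using the static shape helpers (a sketch; the wrapped pointer must outlive the tensor):
```cpp
#include <tensorrt_llm/runtime/iTensor.h>

#include <vector>

using namespace tensorrt_llm::runtime;

int main()
{
    std::vector<float> data(2 * 3 * 4);

    // Non-owning view of shape [2, 3, 4]; it cannot grow beyond data.size().
    auto tensor = ITensor::wrap(data.data(), nvinfer1::Dims3{2, 3, 4}, data.size());

    // volume() multiplies the dimensions: 2 * 3 * 4 = 24.
    auto const n = ITensor::volume(nvinfer1::Dims3{2, 3, 4});

    // squeeze() drops a unit dimension: [1, 3, 4] -> [3, 4].
    auto const squeezed = ITensor::squeeze(nvinfer1::Dims3{1, 3, 4}, 0);

    // Reshape within the wrapped capacity; the new size is volume(dims).
    tensor->reshape(nvinfer1::Dims2{6, 4});
    return n == 24 ? 0 : 1;
}
```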
ipcUtils.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
void setPeerAccess(WorldConfig worldConfig, bool enable = true)
-
class IpcMemory
-
Public Functions
-
IpcMemory(WorldConfig worldConfig, std::size_t bufferSize)
-
~IpcMemory()
-
inline const std::vector<void*> &getCommPtrsTensor() const
Public Static Attributes
-
static constexpr size_t FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t)
Private Members
-
WorldConfig mWorldConfig
-
std::vector<void*> mCommPtrs
-
std::size_t mBufferSize
-
void *mBufferPtr
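A sketch of setting up a buffer shared across the ranks of a world; the buffer size is a placeholder:
```cpp
#include <tensorrt_llm/runtime/ipcUtils.h>
#include <tensorrt_llm/runtime/worldConfig.h>

using namespace tensorrt_llm::runtime;

void setupIpc()
{
    auto const worldConfig = WorldConfig::mpi();

    // Enable CUDA peer access between the GPUs participating in this world.
    setPeerAccess(worldConfig, /*enable=*/true);

    // Allocate an IPC buffer visible to all ranks; one pointer per rank.
    IpcMemory ipcMemory{worldConfig, /*bufferSize=*/1 << 22};
    auto const& commPtrs = ipcMemory.getCommPtrsTensor();
}
```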
memoryCounters.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MemoryCounters
-
Public Functions
-
MemoryCounters() = default
-
template<MemoryType T>
inline void allocate(SizeType size)
-
void allocate(MemoryType memoryType, SizeType size)
-
template<MemoryType T>
inline void deallocate(SizeType size)
-
void deallocate(MemoryType memoryType, SizeType size)
Public Static Functions
-
static inline MemoryCounters &getInstance()
Private Members
Private Static Attributes
-
static thread_local MemoryCounters mInstance
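The counters form a thread-local singleton that allocators update. A sketch (bookkeeping only; no memory is actually allocated here):
```cpp
#include <tensorrt_llm/runtime/memoryCounters.h>

using namespace tensorrt_llm::runtime;

void trackAllocation()
{
    auto& counters = MemoryCounters::getInstance();

    // Record a 1 MiB device allocation and its release (compile-time dispatch).
    counters.allocate<MemoryType::kGPU>(1 << 20);
    counters.deallocate<MemoryType::kGPU>(1 << 20);

    // The run-time overloads take the memory type as an argument.
    counters.allocate(MemoryType::kCPU, 4096);
    counters.deallocate(MemoryType::kCPU, 4096);
}
```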
samplingConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class SamplingConfig
-
Public Members
tllmLogger.h
-
namespace tensorrt_llm
-
namespace runtime
-
class TllmLogger : public ILogger
worldConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class WorldConfig
Public Functions
-
inline explicit constexpr WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode)
-
inline constexpr bool isTensorParallel() const noexcept
-
inline constexpr bool isPipelineParallel() const noexcept
-
inline constexpr bool isFirstPipelineParallelRank() const noexcept
-
inline constexpr bool isLastPipelineParallelRank() const noexcept
Public Static Functions
-
static bool validConfig(nvinfer1::ILogger &logger, SizeType tensorParallelism, SizeType pipelineParallelism)
-
static WorldConfig mpi(nvinfer1::ILogger &logger, SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt)
-
static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt)
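For example, querying the rank layout (a sketch; in multi-process runs the mpi() factory reads the actual rank and world size from the environment):
```cpp
#include <tensorrt_llm/runtime/worldConfig.h>

using namespace tensorrt_llm::runtime;

int main()
{
    // Explicit single-node layout: TP=2, PP=2, this process is rank 0.
    WorldConfig const world{/*tensorParallelism=*/2, /*pipelineParallelism=*/2, /*rank=*/0};

    bool const tp = world.isTensorParallel();
    bool const firstPp = world.isFirstPipelineParallelRank(); // whether this rank runs the first pipeline stage

    // Or let MPI determine rank and parallelism at run time.
    auto const mpiWorld = WorldConfig::mpi();
    return tp && firstPp ? 0 : 1;
}
```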