Executor

executor.h

namespace tensorrt_llm

namespace executor

Variables

SizeType const kDefaultIterStatsMaxIterations = 1000

SizeType const kDefaultRequestStatsMaxIterations = 0

class SamplingConfig

#include <executor.h>

Sampling configuration.

Public Functions

explicit SamplingConfig(SizeType beamWidth = 1, std::optional<SizeType> const &topK = std::nullopt, std::optional<FloatType> const &topP = std::nullopt, std::optional<FloatType> const &topPMin = std::nullopt, std::optional<SizeType> const &topPResetIds = std::nullopt, std::optional<FloatType> const &topPDecay = std::nullopt, std::optional<RandomSeedType> const &randomSeed = std::nullopt, std::optional<FloatType> const &temperature = std::nullopt, std::optional<SizeType> const &minLength = std::nullopt, std::optional<FloatType> const &beamSearchDiversityRate = std::nullopt, std::optional<FloatType> const &repetitionPenalty = std::nullopt, std::optional<FloatType> const &presencePenalty = std::nullopt, std::optional<FloatType> const &frequencyPenalty = std::nullopt, std::optional<FloatType> const &lengthPenalty = std::nullopt, std::optional<SizeType> const &earlyStopping = std::nullopt): Constructor for SamplingConfig. See description of parameters below.

bool operator==(SamplingConfig const &other) const

SizeType getBeamWidth() const

std::optional<SizeType> getTopK() const

std::optional<FloatType> getTopP() const

std::optional<FloatType> getTopPMin() const

std::optional<SizeType> getTopPResetIds() const

std::optional<FloatType> getTopPDecay() const

std::optional<RandomSeedType> getRandomSeed() const

std::optional<FloatType> getTemperature() const

std::optional<SizeType> getMinLength() const

std::optional<FloatType> getBeamSearchDiversityRate() const

std::optional<FloatType> getRepetitionPenalty() const

std::optional<FloatType> getPresencePenalty() const

std::optional<FloatType> getFrequencyPenalty() const

std::optional<FloatType> getLengthPenalty() const

std::optional<SizeType> getEarlyStopping() const

Private Members

SizeType mBeamWidth: The beam width. Default is 1 which disables beam search.

std::optional<SizeType> mTopK: Controls number of logits to sample from. Default is 0 (all logits).

std::optional<FloatType> mTopP: Controls the top-P probability to sample from. Default is 0.f.

std::optional<FloatType> mTopPMin: Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.

std::optional<SizeType> mTopPResetIds: Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.

std::optional<FloatType> mTopPDecay: Controls decay in the top-P algorithm. The decay value. Default is 1.f.

std::optional<RandomSeedType> mRandomSeed: Controls the random seed used by the random number generator in sampling.

std::optional<FloatType> mTemperature: Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.

std::optional<SizeType> mMinLength: Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.

std::optional<FloatType> mBeamSearchDiversityRate: Controls the diversity in beam search.

std::optional<FloatType> mRepetitionPenalty: Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. Default is 1.f.

std::optional<FloatType> mPresencePenalty: Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.

std::optional<FloatType> mFrequencyPenalty: Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.

std::optional<FloatType> mLengthPenalty: Controls how to penalize longer sequences in beam search. Default is 0.f.

std::optional<SizeType> mEarlyStopping: Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token)

Friends

friend class Serialization

class OutputConfig

#include <executor.h>

Configuration that controls the outputs of a Result.

Public Functions

explicit OutputConfig(bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, bool excludeInputFromOutput = false)

Public Members

bool returnLogProbs: Controls if Result should contain log probabilities. Default is false.

bool returnContextLogits: Controls if Result should contain the context logits. Default is false.

bool returnGenerationLogits: Controls if Result should contain the generation logits. Default is false.

bool excludeInputFromOutput: Controls if output tokens in Result should include the input tokens. Default is false.

class SpeculativeDecodingConfig

#include <executor.h>

Configuration for speculative decoding. Allows including draft tokens and draft logits, and specifying an acceptance threshold.

Public Functions

explicit SpeculativeDecodingConfig(VecTokens tokens, std::optional<Tensor> logits = std::nullopt, std::optional<FloatType> const &acceptanceThreshold = std::nullopt)

VecTokens getTokens() const

std::optional<Tensor> getLogits() const

std::optional<FloatType> getAcceptanceThreshold() const

Private Members

VecTokens mTokens: The draft tokens.

std::optional<Tensor> mLogits: The draft logits. Expected shape: [num_draft_tokens, vocab_size].

std::optional<FloatType> mAcceptanceThreshold: The acceptance threshold. Must be > 0.f and <= 1.f.

Friends

friend class Serialization

class PromptTuningConfig

#include <executor.h>

Configuration for prompt tuning.

Public Functions

explicit PromptTuningConfig(Tensor embeddingTable)

Tensor getEmbeddingTable() const

Private Members

Tensor mEmbeddingTable: The prompt embedding table. Expected shape: [task vocab_size, hidden_size]. Data type must match model weights.

Friends

friend class Serialization

class LoraConfig

#include <executor.h>

Configuration for LoRA.

Public Functions

explicit LoraConfig(IdType taskId, std::optional<Tensor> weights = std::nullopt, std::optional<Tensor> config = std::nullopt)

IdType getTaskId() const

std::optional<Tensor> getWeights() const

std::optional<Tensor> getConfig() const

Private Members

IdType mTaskId: The Lora task id.

std::optional<Tensor> mWeights: The Lora weights. See TRT-LLM documentation for expected shapes and types.

std::optional<Tensor> mConfig: The Lora configuration. See TRT-LLM documentation for detailed description of the config tensor.

Friends

friend class Serialization

class Request

#include <executor.h>

A class that holds information about the request.

Public Functions

Request(VecTokens inputTokenIds, SizeType maxNewTokens, bool streaming = false, SamplingConfig const &samplingConfig = SamplingConfig(), OutputConfig const &outputConfig = OutputConfig(), std::optional<SizeType> const &endId = std::nullopt, std::optional<SizeType> const &padId = std::nullopt, std::optional<std::list<VecTokens>> badWords = std::nullopt, std::optional<std::list<VecTokens>> stopWords = std::nullopt, std::optional<Tensor> embeddingBias = std::nullopt, std::optional<SpeculativeDecodingConfig> speculativeDecodingConfig = std::nullopt, std::optional<PromptTuningConfig> pTuningConfig = std::nullopt, std::optional<LoraConfig> loraConfig = std::nullopt, std::optional<std::string> logitsPostProcessorName = std::nullopt)

The Request constructor.

Parameters:

inputTokenIds – The input token ids
maxNewTokens – The maximum number of tokens to generate
streaming – Indicates if the responses should be streamed or not. Default is false.
samplingConfig – The sampling configuration
outputConfig – The output configuration
endId – The end token id
padId – The pad token id
badWords – A list of bad words tokens. Each “word” can be composed of multiple tokens
stopWords – A list of stop words tokens. Each “word” can be composed of multiple tokens
embeddingBias – The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]
speculativeDecodingConfig – The speculative decoding configuration
pTuningConfig – The prompt tuning configuration
loraConfig – The LoRA configuration
logitsPostProcessorName – The logits postprocessor name. Must correspond to one of the logits postprocessor names provided to the ExecutorConfig.

Request(Request const &other)

Request(Request &&other) noexcept

Request &operator=(Request const &other)

Request &operator=(Request &&other) noexcept

~Request()

VecTokens getInputTokenIds() const

SizeType getMaxNewTokens() const

bool getStreaming() const

SamplingConfig getSamplingConfig() const

OutputConfig getOutputConfig() const

std::optional<SizeType> getEndId() const

std::optional<SizeType> getPadId() const

std::optional<std::list<VecTokens>> getBadWords() const

std::optional<std::list<VecTokens>> getStopWords() const

std::optional<Tensor> getEmbeddingBias() const

std::optional<SpeculativeDecodingConfig> getSpeculativeDecodingConfig() const

std::optional<PromptTuningConfig> getPromptTuningConfig() const

std::optional<LoraConfig> getLoraConfig() const

std::optional<std::string> getLogitsPostProcessorName() const

void setStreaming(bool streaming)

void setSamplingConfig(SamplingConfig const &config)

void setOutputConfig(OutputConfig const &outputConfig)

void setEndId(SizeType endId)

void setPadId(SizeType padId)

void setBadWords(std::list<VecTokens> const &badWords)

void setStopWords(std::list<VecTokens> const &stopWords)

void setEmbeddingBias(Tensor const &embeddingBias)

void setSpeculativeDecodingConfig(SpeculativeDecodingConfig const &specDecodingConfig)

void setPromptTuningConfig(PromptTuningConfig const &pTuningConfig)

void setLoraConfig(LoraConfig const &loraConfig)

void setLogitsPostProcessorName(std::string const &logitsPostProcessorName)

Private Members

std::unique_ptr<Impl> mImpl

Friends

friend class Serialization

struct Result

#include <executor.h>

Struct that holds the generation result.

Public Members

bool isFinal: Indicates if this is the final result for the request.

BeamTokens outputTokenIds: The output tokens for each beam.

std::optional<VecLogProbs> cumLogProbs: The cumulative log probabilities. Size beamSize.

std::optional<std::vector<VecLogProbs>> logProbs: The log probabilities for each generated token. Size [beamSize, outputLen].

std::optional<Tensor> contextLogits: The context logits. Size [promptLen, vocabSizePadded].

std::optional<Tensor> generationLogits: The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded].

class Response

#include <executor.h>

Class that holds either an error or a result.

Public Functions

Response(IdType requestId, std::string errorMsg)

Response(IdType requestId, Result Result)

~Response()

Response(Response const &other)

Response(Response &&other) noexcept

Response &operator=(Response const &other)

Response &operator=(Response &&other) noexcept

IdType getRequestId() const: Get the id of the request for which this response was generated.

bool hasError() const: Indicates if this response has an error or not.

std::string getErrorMsg() const: Get the error msg for this response. Will throw an exception if hasError is false.

Result getResult() const: Get the result for this response. Will throw an exception if hasError is true.

Private Members

std::unique_ptr<Impl> mImpl

class SchedulerConfig

#include <executor.h>

Configuration class for the scheduler.

Public Functions

explicit SchedulerConfig(SchedulerPolicy policy = SchedulerPolicy::kGUARANTEED_NO_EVICT)

SchedulerPolicy getPolicy() const

Private Members

SchedulerPolicy mPolicy: The scheduler policy. See SchedulerPolicy.

class KvCacheConfig

#include <executor.h>

Configuration class for the KV cache.

Public Functions

explicit KvCacheConfig(bool enableBlockReuse = false, std::optional<SizeType> const &maxTokens = std::nullopt, std::optional<SizeType> const &maxAttentionWindow = std::nullopt, std::optional<SizeType> const &sinkTokenLength = std::nullopt, std::optional<FloatType> const &freeGpuMemoryFraction = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt, bool onboardBlocks = true)

bool getEnableBlockReuse() const

std::optional<SizeType> getMaxTokens() const

std::optional<SizeType> getMaxAttentionWindow() const

std::optional<SizeType> getSinkTokenLength() const

std::optional<FloatType> getFreeGpuMemoryFraction() const

std::optional<size_t> getHostCacheSize() const

bool getOnboardBlocks() const

Private Members

bool mEnableBlockReuse: Controls if KV cache blocks can be reused for different requests.

std::optional<SizeType> mMaxTokens: The maximum number of tokens that should be stored in the KV cache. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.

std::optional<SizeType> mMaxAttentionWindow: Size of the attention window for each sequence. Only the last mMaxAttentionWindow tokens of each sequence will be stored in the KV cache.

std::optional<SizeType> mSinkTokenLength: Number of sink tokens (tokens to always keep in attention window)

std::optional<FloatType> mFreeGpuMemoryFraction: The fraction of GPU memory that should be allocated for the KV cache. Default is 90%. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.

std::optional<size_t> mHostCacheSize: Size of secondary memory pool in bytes. Default is 0. Having a secondary memory pool increases KV cache block reuse potential.

bool mOnboardBlocks: Controls whether offloaded blocks should be onboarded back into primary memory before being reused.

class ParallelConfig

#include <executor.h>

A configuration class for the parallel execution parameters. Currently only supports commType = CommunicationType::kMPI.

Public Functions

explicit ParallelConfig(CommunicationType commType = CommunicationType::kMPI, CommunicationMode commMode = CommunicationMode::kLEADER, std::optional<std::vector<SizeType>> deviceIds = std::nullopt, std::optional<std::vector<SizeType>> participantIds = std::nullopt)

Constructor.

Parameters:

commType – The communication type. See CommunicationType.
commMode – The communication mode. See CommunicationMode.
deviceIds – The IDs of the GPUs involved in the execution of the model
participantIds – The participant IDs (MPI ranks if commType == kMPI) involved in the execution of the model. The first participant is considered to be the leader.

CommunicationType getCommunicationType() const

CommunicationMode getCommunicationMode() const

std::optional<std::vector<SizeType>> getDeviceIds() const

std::optional<std::vector<SizeType>> getParticipantIds() const

void setCommunicationType(CommunicationType type)

void setCommunicationMode(CommunicationMode mode)

void setDeviceIds(std::vector<SizeType> const &deviceIds)

void setParticipantIds(std::vector<SizeType> const &participantIds)

Private Members

CommunicationType mCommType: The type of communication protocol used. Default is MPI.

CommunicationMode mCommMode: The mode of communication. See CommunicationMode.

std::optional<std::vector<SizeType>> mDeviceIds: The GPU device ids to use for executing this model.

std::optional<std::vector<SizeType>> mParticipantIds: The participant ids (MPI ranks for example) used for executing this model.

class PeftCacheConfig

#include <executor.h>

Configuration for PeftCacheManager.

Public Functions

explicit PeftCacheConfig(SizeType numHostModuleLayer = 0, SizeType numDeviceModuleLayer = 0, SizeType optimalAdapterSize = 8, SizeType maxAdapterSize = 64, SizeType numPutWorkers = 1, SizeType numEnsureWorkers = 1, SizeType numCopyStreams = 1, SizeType maxPagesPerBlockHost = 24, SizeType maxPagesPerBlockDevice = 8, std::optional<float> const &deviceCachePercent = std::nullopt, std::optional<size_t> const &hostCacheSize = std::nullopt)

SizeType getNumHostModuleLayer() const

SizeType getNumDeviceModuleLayer() const

SizeType getOptimalAdapterSize() const

SizeType getMaxAdapterSize() const

SizeType getNumPutWorkers() const

SizeType getNumEnsureWorkers() const

SizeType getNumCopyStreams() const

SizeType getMaxPagesPerBlockHost() const

SizeType getMaxPagesPerBlockDevice() const

std::optional<float> getDeviceCachePercent() const

std::optional<size_t> getHostCacheSize() const

Private Members

SizeType mNumHostModuleLayer

SizeType mNumDeviceModuleLayer

SizeType mOptimalAdapterSize

SizeType mMaxAdapterSize

SizeType mNumPutWorkers

SizeType mNumEnsureWorkers

SizeType mNumCopyStreams

SizeType mMaxPagesPerBlockHost

SizeType mMaxPagesPerBlockDevice

std::optional<float> mDeviceCachePercent

std::optional<size_t> mHostCacheSize

class ExecutorConfig

#include <executor.h>

Configuration class for the model executor.

Public Functions

explicit ExecutorConfig(SizeType maxBeamWidth = 1, SchedulerConfig const &schedulerConfig = SchedulerConfig(), KvCacheConfig const &kvCacheConfig = KvCacheConfig(), bool enableChunkedContext = false, bool normalizeLogProbs = true, SizeType iterStatsMaxIterations = kDefaultIterStatsMaxIterations, SizeType requestStatsMaxIterations = kDefaultRequestStatsMaxIterations, BatchingType batchingType = BatchingType::kINFLIGHT, std::optional<ParallelConfig> parallelConfig = std::nullopt, std::optional<PeftCacheConfig> const &peftCacheConfig = std::nullopt, std::optional<LogitsPostProcessorMap> logitsPostProcessorMap = std::nullopt, std::optional<MedusaChoices> medusaChoices = std::nullopt)

SizeType getMaxBeamWidth() const

SchedulerConfig getSchedulerConfig() const

KvCacheConfig getKvCacheConfig() const

bool getEnableChunkedContext() const

bool getNormalizeLogProbs() const

SizeType getIterStatsMaxIterations() const

SizeType getRequestStatsMaxIterations() const

BatchingType getBatchingType() const

std::optional<ParallelConfig> getParallelConfig() const

std::optional<PeftCacheConfig> getPeftCacheConfig() const

std::optional<LogitsPostProcessorMap> getLogitsPostProcessorMap() const

std::optional<MedusaChoices> getMedusaChoices() const

void setMaxBeamWidth(SizeType maxBeamWidth)

void setSchedulerConfig(SchedulerConfig const &schedulerConfig)

void setKvCacheConfig(KvCacheConfig const &kvCacheConfig)

void setEnableChunkedContext(bool enableChunkedContext)

void setNormalizeLogProbs(bool normalizeLogProbs)

void setIterStatsMaxIterations(SizeType iterStatsMaxIterations)

void setRequestStatsMaxIterations(SizeType requestStatsMaxIterations)

void setBatchingType(BatchingType batchingType)

void setParallelConfig(ParallelConfig const &parallelConfig)

void setPeftCacheConfig(PeftCacheConfig const &peftCacheConfig)

void setLogitsPostProcessorMap(LogitsPostProcessorMap const &logitsPostProcessorMap)

void setMedusaChoices(MedusaChoices const &medusaChoices)

Private Members

SizeType mMaxBeamWidth: The beam width value of requests that will be sent to the executor.

SchedulerConfig mSchedulerConfig: The scheduler configuration.

KvCacheConfig mKvCacheConfig: The KV cache configuration.

bool mEnableChunkedContext: Controls whether chunked context is enabled.

bool mNormalizeLogProbs: Controls if log probabilities should be normalized or not.

SizeType mIterStatsMaxIterations: Controls the maximum number of iterations for which to keep statistics.

SizeType mRequestStatsMaxIterations: Controls the maximum number of iterations for which to keep per-request statistics.

BatchingType mBatchingType: The type of batching strategy to use. See BatchingType.

std::optional<ParallelConfig> mParallelConfig: The parallel execution configuration.

std::optional<PeftCacheConfig> mPeftCacheConfig

std::optional<LogitsPostProcessorMap> mLogitsPostProcessorMap

std::optional<MedusaChoices> mMedusaChoices

class Executor

#include <executor.h>

The executor is responsible for receiving new requests and sending responses, and running the inference.

Public Functions

Executor(std::filesystem::path const &modelPath, ModelType modelType, ExecutorConfig const &executorConfig)

Parameters:

modelPath – Path to the folder that defines the model to run
modelType – The type of model
executorConfig – The configuration for the executor
comm – An optional inter-process communicator configuration

Executor(std::vector<uint8_t> const &engineBuffer, std::string const &jsonConfigStr, ModelType modelType, ExecutorConfig const &executorConfig)

Executor(std::shared_ptr<Model> model, ExecutorConfig const &executorConfig)

~Executor()

IdType enqueueRequest(Request const &request)

Enqueue a new request.

Parameters:: request – The LLM request which contains input tokens and request parameters
Returns:: A unique id that identifies the request

std::vector<IdType> enqueueRequests(std::vector<Request> const &requests): Enqueue a batch of requests.

std::vector<Response> awaitResponses(std::optional<IdType> const &requestId = std::nullopt, std::optional<std::chrono::milliseconds> const &timeout = std::nullopt)

Await for ready responses.

Parameters:

requestId – An optional request id. If not specified, responses for any request can be returned
timeout – The maximum time to wait for new responses

Returns:

A vector of responses

SizeType getNumResponsesReady(std::optional<IdType> const &requestId = std::nullopt) const

Get the number of ready responses.

Parameters:: requestId – An optional request id
Returns:: The number of ready responses

void cancelRequest(IdType requestId)

Cancel the request with provided request id.

Parameters:: requestId – The request id for which to cancel the response

void shutdown(): Signals the server to shut down. This call is blocking. Only returns when all requests have terminated or timeout has been reached.

std::deque<IterationStats> getLatestIterationStats()

Returns the per-iteration statistics computed since the last call to getLatestIterationStats. Contains at most iterStatsMaxIterations iterations.

Returns:: Iteration stats

std::deque<RequestStatsPerIteration> getLatestRequestStats()

Returns the request stats of each iteration computed since the last call to getLatestRequestStats. Contains at most requestStatsMaxIterations iterations.

Returns:: Request stats grouped by iterations

bool canEnqueueRequests() const: Indicates if the current process is allowed to enqueueRequests.

Private Members

std::unique_ptr<Impl> mImpl

class JsonSerialization

#include <executor.h>

Class with utility functions to serialize statistics to json string.

Public Static Functions

static std::string toJsonStr(IterationStats const &iterationStats): Utility function to convert an iterationStats struct to a json serialized string.

static std::string toJsonStr(RequestStatsPerIteration const &requestStatsPerIter): Utility function to convert a requestStatsPerIteration struct to a json serialized string.

static std::string toJsonStr(RequestStats const &requestStats): Utility function to convert a requestStats struct to a json serialized string.

tensor.h

namespace tensorrt_llm

namespace executor

class Shape : public tensorrt_llm::common::ArrayView<std::int32_t const>

Public Types

using Base = tensorrt_llm::common::ArrayView<std::int32_t const>

using DimType = typename std::remove_cv_t<Base::value_type>

Public Functions

inline Shape()

inline Shape(DimType const *data, Base::size_type size)

inline Shape(std::initializer_list<DimType> dims)

class Tensor

Public Types

using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>

Public Functions

Tensor copyToCpu(Tensor::CudaStreamPtr stream = nullptr) const

Tensor copyToPinned(Tensor::CudaStreamPtr stream = nullptr) const

Tensor copyToPooledPinned(Tensor::CudaStreamPtr stream = nullptr) const

Tensor copyToManaged(Tensor::CudaStreamPtr stream = nullptr) const

Tensor copyToGpu(Tensor::CudaStreamPtr stream) const

Tensor() noexcept = default

~Tensor() = default

Tensor(Tensor const &other) noexcept = default

Tensor(Tensor &&other) noexcept = default

Tensor &operator=(Tensor const &other) noexcept = default

Tensor &operator=(Tensor &&other) noexcept = default

void *getData(): Returns a pointer to underlying array.

void const *getData() const: Returns a pointer to underlying array.

DataType getDataType() const: Returns the data type of the buffer.

MemoryType getMemoryType() const: Returns the memory type of the buffer.

Shape getShape() const: Returns the tensor dimensions.

std::size_t getSize() const: Returns the number of elements in the tensor.

std::size_t getSizeInBytes() const: Returns the size of the tensor in bytes.

void setZero(CudaStreamPtr stream = nullptr)

Set the entire memory to zero.

Parameters:: stream – Must be a valid CUDA stream if the memory type is GPU.

void setFrom(Tensor const &other, CudaStreamPtr stream = nullptr)

Copy the data and shape from another tensor.

Parameters:

other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.

inline explicit operator bool() const

inline bool operator==(Tensor const &rhs) const

inline bool operator!=(Tensor const &rhs) const

Public Static Functions

static Tensor cpu(DataType dataType, Shape shape = {})

Allocate a cpu tensor with the given shape and data type.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.

template<typename T> static inline Tensor cpu(Shape shape = {})

static Tensor pinned(DataType dataType, Shape shape = {})

Allocate a cpu tensor in pinned memory with the given shape and data type.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.

template<typename T> static inline Tensor pinned(Shape shape = {})

static Tensor pooledPinned(DataType dataType, Shape shape = {})

Allocate a cpu tensor in pooled pinned memory with the given shape and data type.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.

template<typename T> static inline Tensor pooledPinned(Shape shape = {})

static Tensor managed(DataType dataType, Shape shape = {})

Allocate a tensor in managed memory (UVM) with the given shape and data type.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.

template<typename T> static inline Tensor managed(Shape shape = {})

static Tensor gpu(DataType dataType, CudaStreamPtr stream, Shape shape = {})

Allocate a gpu tensor with the given shape and data type on a particular cuda stream.

Parameters:

shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.

template<typename T> static inline Tensor gpu(CudaStreamPtr stream, Shape shape = {})

static Tensor of(DataType dataType, void *data, Shape shape)

Wrap a data pointer into a tensor without taking ownership.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.
data – The pointer to the data to wrap.

template<typename T> static inline Tensor of(T *data, Shape shape)

Wrap a data pointer into a tensor without taking ownership.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.
data – The pointer to the data to wrap.

template<typename T> static inline Tensor of(T &data)

Wrap any container into a tensor without taking ownership.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.
data – The container to wrap.

Private Types

using Impl = runtime::ITensor 

Private Functions

explicit Tensor(std::shared_ptr<runtime::ITensor> tensor)

Tensor copyTo(std::shared_ptr<Impl> tensor, CudaStreamPtr stream) const

Private Members

std::shared_ptr<Impl> mTensor

Private Static Functions

template<typename T> static inline DataType getRuntimeType()

Friends

friend class Serialization

friend std::shared_ptr<runtime::ITensor> const &toITensor(Tensor const &tensor)

friend Tensor ofITensor(std::shared_ptr<runtime::ITensor> tensor)

namespace detail

Functions

std::shared_ptr<runtime::ITensor> const &toITensor(Tensor const &tensor)

Tensor ofITensor(std::shared_ptr<runtime::ITensor> tensor)

namespace runtime

types.h

template<> struct TypeTraits<std::int8_t>

Public Static Attributes

static constexpr auto value = DataType::kINT8 

template<> struct TypeTraits<std::int32_t>

Public Static Attributes

static constexpr auto value = DataType::kINT32 

template<> struct TypeTraits<std::int64_t>

Public Static Attributes

static constexpr auto value = DataType::kINT64 

template<> struct TypeTraits<std::uint8_t>

Public Static Attributes

static constexpr auto value = DataType::kUINT8 

namespace tensorrt_llm

namespace executor

Typedefs

using TensorPtr = std::shared_ptr<Tensor>

using SizeType = std::int32_t

using FloatType = float

using TokenIdType = std::int32_t

using VecTokens = std::vector<TokenIdType>

using BeamTokens = std::vector<VecTokens>

using IdType = std::uint64_t

using IterationType = std::uint64_t

using RandomSeedType = std::uint64_t

using VecLogProbs = std::vector<FloatType>

using StreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>

using LogitsPostProcessor = std::function<void(IdType, Tensor&, BeamTokens const&, StreamPtr&)>

using LogitsPostProcessorMap = std::unordered_map<std::string, LogitsPostProcessor>

using MedusaChoices = std::vector<std::vector<SizeType>>

Enums

enum class DataType

Values:

enumerator kBOOL

enumerator kUINT8

enumerator kINT8

enumerator kINT32

enumerator kINT64

enumerator kBF16

enumerator kFP8

enumerator kFP16

enumerator kFP32

enumerator kUNKNOWN

enum class MemoryType

Values:

enumerator kCPU

enumerator kCPU_PINNED

enumerator kGPU

enumerator kUVM

enumerator kUNKNOWN

enum class ModelType

Values:

enumerator kDECODER_ONLY

enum class BatchingType

The batching type.

Values:

enumerator kSTATIC: STATIC refers to the traditional batching scheme with a batch of requests running in lockstep until the full generation for all of them is complete. Requests in a batch are all padded up to the maximum input and output sequence length of any member of the batch.

enumerator kINFLIGHT: INFLIGHT refers to a scheme where newly arrived requests are dynamically incorporated into the batch under execution, and requests are returned as soon as the end condition is met without any padding.

enum class SchedulerPolicy

The policy used to select the subset of available requests in each iteration of the executor generation loop.

Values:

enumerator kMAX_UTILIZATION: MAX_UTILIZATION packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.

enumerator kGUARANTEED_NO_EVICT: GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction.

enum class CommunicationType

Values:

enumerator kMPI

enum class CommunicationMode

Values:

enumerator kLEADER

enum class RequestStage

Enum class that represents the state of a request.

Values:

enumerator kQUEUED: Request that has been received but not yet included in the active requests (due to constraints such as maximum batch size for example).

enumerator kCONTEXT_IN_PROGRESS: Active request in context phase.

enumerator kGENERATION_IN_PROGRESS: Active request in generation phase.

enumerator kGENERATION_COMPLETE: Active request for which generation has completed.

template<typename T, bool = false> struct TypeTraits: #include <types.h>

For converting a C++ data type to a TRT-LLM DataType.

template<> struct TypeTraits<float>

Public Static Attributes

static constexpr auto value = DataType::kFP32 

template<> struct TypeTraits<half>

Public Static Attributes

static constexpr auto value = DataType::kFP16 

template<> struct TypeTraits<int8_t>

Public Static Attributes

static constexpr auto value = DataType::kINT8 

template<> struct TypeTraits<int32_t>

Public Static Attributes

static constexpr auto value = DataType::kINT32

template<> struct TypeTraits<int64_t>

Public Static Attributes

static constexpr auto value = DataType::kINT64

template<> struct TypeTraits<bool>

Public Static Attributes

static constexpr auto value = DataType::kBOOL 

template<> struct TypeTraits<uint8_t>

Public Static Attributes

static constexpr auto value = DataType::kUINT8

template<typename T> struct TypeTraits<T*>

Public Static Attributes

static constexpr auto value = DataType::kINT64 

struct KvCacheStats

#include <types.h>

Struct that holds the stats of a KV cache manager.

Public Members

SizeType maxNumBlocks: Max number of blocks.

SizeType freeNumBlocks: Number of free blocks.

SizeType usedNumBlocks: Number of used blocks.

SizeType tokensPerBlock: Number of tokens per block.

struct StaticBatchingStats

#include <types.h>

Struct that holds the stats of static batching models for a single iteration.

Public Members

SizeType numScheduledRequests: Number of scheduled requests.

SizeType numContextRequests: Number of requests in context stage.

SizeType numCtxTokens: Total number of context tokens in the iteration.

SizeType numGenTokens: Total number of tokens to generate in the iteration.

SizeType emptyGenSlots: Total number of unused generation token slots.

struct InflightBatchingStats

#include <types.h>

Struct that holds the stats of inflight batching models for a single iteration.

Public Members

SizeType numScheduledRequests: Number of scheduled requests.

SizeType numContextRequests: Number of requests in context stage.

SizeType numGenRequests: Number of requests in generation stage.

SizeType numPausedRequests: Number of paused requests.

SizeType numCtxTokens: Total number of context tokens in the iteration.

SizeType microBatchId: Index of micro batch.

struct IterationStats

#include <types.h>

Struct that holds the stats of a single iteration.

Public Members

std::string timestamp: Ending time of this iteration.

IterationType iter: Iteration id.

SizeType numActiveRequests: Number of active requests.

SizeType maxNumActiveRequests: Maximum number of active requests.

size_t gpuMemUsage: GPU memory usage in bytes.

size_t cpuMemUsage: CPU memory usage in bytes.

size_t pinnedMemUsage: Pinned memory usage in bytes.

std::optional<KvCacheStats> kvCacheStats: Stats specific to KV caches.

std::optional<StaticBatchingStats> staticBatchingStats: Stats specific to static batching.

std::optional<InflightBatchingStats> inflightBatchingStats: Stats specific to inflight batching.

struct RequestStats

#include <types.h>

Struct that holds the stats of a single request.

Public Members

IdType id: The request id.

RequestStage stage: The current stage the request is in.

SizeType contextPrefillPosition: If using chunked context, the current context prefill position.

SizeType numGeneratedTokens: The number of generated tokens so far.

bool scheduled: Whether the request is scheduled for the current iteration.

bool paused: Whether the request is being paused at the current iteration due to lack of resources (KV cache block exhaustion, for example).

struct RequestStatsPerIteration

#include <types.h>

Struct that holds the stats of all requests in an iteration.

Public Members

IterationType iter: The iteration id for these stats.

std::vector<RequestStats> requestStats: The stats of all active requests for this iteration.

namespace runtime