Runtime
bufferManager.h
-
namespace tensorrt_llm
-
namespace runtime
-
class BufferManager
- #include <bufferManager.h>
A helper class for managing memory on host and device.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
explicit BufferManager(CudaStreamPtr stream)
Construct a BufferManager.
- Parameters:
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
-
IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size on the GPU.
-
ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions on the GPU.
-
IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size in UVM.
-
ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions in UVM.
-
IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size and memory type.
-
ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions and memory type.
-
inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty IBuffer of the given memory type. It may be resized later.
-
inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty ITensor of the given memory type. It may be reshaped later.
-
void copy(void const *src, IBuffer &dst, MemoryType srcType) const
Copy src to dst.
-
void copy(IBuffer const &src, void *dst, MemoryType dstType) const
Copy src to dst.
-
IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
CudaStream const &getStream() const
Get the underlying cuda stream.
-
std::size_t memoryPoolReserved() const
The current size of the memory reserved by the memory pool.
-
std::size_t memoryPoolUsed() const
The current size of the memory used by the memory pool.
-
std::size_t memoryPoolFree() const
The current size of the memory free in the memory pool.
-
void memoryPoolTrimTo(std::size_t size)
Try to trim the memory reserved by the pool to size bytes. This synchronizes implicitly with the stream.
Public Static Functions
-
static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the CPU.
-
static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU.
-
static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU in the default memory pool.
-
static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU in the default memory pool.
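A minimal usage sketch of the allocation and copy helpers above, assuming only the signatures listed in this section; the include paths and concrete sizes are illustrative.

```cpp
#include <bufferManager.h>
#include <cudaStream.h>
#include <iTensor.h>

#include <memory>
#include <vector>

using namespace tensorrt_llm::runtime;

void bufferManagerSketch()
{
    // All GPU operations of the manager are issued on this stream.
    auto stream = std::make_shared<CudaStream>();
    BufferManager manager{stream};

    // A [2, 3] float tensor on the GPU and a 16-byte buffer in UVM.
    auto gpuTensor = manager.gpu(ITensor::makeShape({2, 3}), nvinfer1::DataType::kFLOAT);
    auto uvmBuffer = manager.managed(16);

    // Copy host data into a newly allocated GPU buffer.
    std::vector<float> hostData(6, 1.0F);
    auto deviceCopy = manager.copyFrom(hostData, MemoryType::kGPU);

    // Pinned host memory does not require a manager instance.
    auto pinned = BufferManager::pinned(1024);

    // Wait for the asynchronous allocations and copies before reading results.
    stream->synchronize();
}
```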
Private Members
-
CudaStreamPtr mStream
common.h
-
namespace tensorrt_llm
-
namespace runtime
cudaEvent.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaEvent
Public Types
-
using pointer = cudaEvent_t
Public Functions
-
inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)
Creates a new cuda event. The event will be destroyed in the destructor.
- Parameters:
flags – Flags for event creation. By default, event timing is disabled.
-
inline explicit CudaEvent(pointer event, bool ownsEvent = true)
Pass an existing cuda event to this object.
- Parameters:
event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
-
inline void synchronize() const
Synchronizes the event.
Private Types
-
using EventPtr = std::unique_ptr<element_type, Deleter>
cudaStream.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaStream
Public Functions
-
inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)
Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
- Parameters:
flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
-
inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)
Pass an existing cuda stream to this object.
- Parameters:
stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
-
inline int getDevice() const
Returns the device on which the stream was created.
-
inline cudaStream_t get() const
Returns the stream associated with this object.
-
inline void synchronize() const
Synchronizes the stream.
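A short sketch of the stream wrapper, assuming the constructors and accessors listed above; the include path is illustrative.

```cpp
#include <cudaStream.h>

using tensorrt_llm::runtime::CudaStream;

void cudaStreamSketch()
{
    // Non-blocking stream with default priority on the current device.
    CudaStream stream{cudaStreamNonBlocking, /*priority=*/0};

    // The raw handle can be passed to CUDA or TensorRT APIs directly.
    cudaStream_t raw = stream.get();
    static_cast<void>(raw);

    // Device index the stream was created on, and a blocking wait for completion.
    int const device = stream.getDevice();
    static_cast<void>(device);
    stream.synchronize();
}
```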
decodingInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingInput
-
Public Functions
Public Members
decodingOutput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingOutput
-
Public Members
-
BeamHypotheses beamHypotheses
Public Static Attributes
-
static constexpr float kNegativeInfinity = -1e20f
-
class BeamHypotheses
Public Functions
-
void empty(BufferManager &manager)
-
void release()
-
void init(BufferManager &manager, TokenIdType endId)
-
BeamHypotheses slice(SizeType batchIndex, SizeType size) const
generationInput.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor, typename PromptTuningParams>
class GenericGenerationInput -
Public Functions
-
class GenerationInput : public tensorrt_llm::runtime::GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
Public Types
-
using Base = GenericGenerationInput<ITensor::SharedPtr, PromptTuningParams>
generationOutput.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor>
class GenericGenerationOutput Public Types
-
class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput<ITensor::SharedPtr>
Public Types
-
using Base = GenericGenerationOutput<ITensor::SharedPtr>
gptDecoder.h
-
namespace tensorrt_llm
-
namespace layers
-
namespace runtime
-
class IGptDecoder
Subclassed by tensorrt_llm::runtime::GptDecoder< T >
Public Functions
-
virtual ~IGptDecoder() = default
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength) = 0
-
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) = 0
-
virtual const SamplingConfig &getSamplingConfig() = 0
Public Static Functions
-
static void acceptDraftTokensByIds(const ITensor &targetTokenIds, const ITensor &draftTokenIds, const ITensor &contextLengths, const ITensor &numDraftTokens, ITensor &sequenceLengths, const ITensor &finishedVec, ITensor &finishedFinal, ITensor &finishedSum, BufferManager::CudaStreamPtr const &stream)
-
static void acceptDraftTokensByLogits(ITensor &draftLogits, const ITensor &targetLogits, ITensor &draftProbs, ITensor &targetProbs, const ITensor &numDraftTokens, ITensor &finished, SizeType vocabSize, SizeType vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, curandState_t *curandState, BufferManager::CudaStreamPtr const &stream)
-
static inline std::unique_ptr<IGptDecoder> create(nvinfer1::DataType dtype, size_t maxBatchSize, size_t vocabSize, size_t vocabSizePadded, BufferManager::CudaStreamPtr const &stream)
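A hedged sketch of creating a decoder through the factory above; the concrete batch and vocabulary sizes are illustrative, and construction of DecodingInput/DecodingOutput is elided because their members are not documented in this section.

```cpp
#include <gptDecoder.h>

using namespace tensorrt_llm::runtime;

void decoderSketch(BufferManager::CudaStreamPtr const& stream, SamplingConfig const& samplingConfig)
{
    // The factory picks the GptDecoder<T> specialization matching the logits dtype.
    auto decoder = IGptDecoder::create(
        nvinfer1::DataType::kFLOAT, /*maxBatchSize=*/8, /*vocabSize=*/32000, /*vocabSizePadded=*/32000, stream);

    // Prepare sampling state for a batch of 8 sequences of at most 1024 tokens.
    decoder->setup(samplingConfig, /*batchSize=*/8, /*maxSequenceLength=*/1024);

    // One decoding step per call; input/output construction is omitted here:
    // bool const finished = decoder->forward(output, input);
}
```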
-
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder Public Types
-
using CudaStreamPtr = BufferManager::CudaStreamPtr
Public Functions
-
GptDecoder(size_t maxBatchSize, size_t vocabSize, size_t vocabSizePadded, CudaStreamPtr const &stream)
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize, SizeType maxSequenceLength) override
-
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
-
virtual void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager) override
-
inline virtual const SamplingConfig &getSamplingConfig() override
Private Members
-
BufferManager mManager
-
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
-
SamplingConfig mSamplingConfig
gptDecoderBatch.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
- #include <gptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
-
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, nvinfer1::DataType dtype) override
Setup the decoder before calling forward().
-
virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) override
Initialize the decoder at batchIdx with a new request.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) override
Initialize the decoder with a new batch of inputs.
-
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::Token const &e) override
Wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() override
Wait for the last call to forwardAsync to complete.
-
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
-
inline virtual TensorPtr getOutputIds(SizeType batchIdx) const override
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
inline virtual TensorPtr getOutputIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu
-
virtual CudaEvent finalize(SizeType batchIdx) const
Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.
-
virtual void finalize() const override
Gather final beam search results for all requests.
-
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getCumLogProbs(SizeType batchIdx) const
- Returns:
[maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs() const override
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getLogProbs(SizeType batchIdx) const
- Returns:
[maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
inline virtual TensorPtr getAllNewTokens() const override
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
inline virtual TensorPtr getNewTokens(SizeType iter = 0) const override
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
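A sketch of the token-based in-flight decoding flow described above: forwardAsync returns a synchronization token that is later passed to forwardSync. Construction of decoder_batch::Input/Output is elided.

```cpp
#include <gptDecoderBatch.h>

using namespace tensorrt_llm::runtime;

void decodeOneStep(GptDecoderBatch& decoder, decoder_batch::Output& output, decoder_batch::Input const& input)
{
    // Launch one step for all active requests without blocking the host.
    auto token = decoder.forwardAsync(output, input);

    // ...overlap other host-side work here...

    // Block until the step associated with this token has completed.
    decoder.forwardSync(*token);

    // Per-request completion flags and the newly generated tokens (on GPU).
    auto const finished = decoder.getFinished();
    auto newTokens = decoder.getAllNewTokens();
    static_cast<void>(finished);
    static_cast<void>(newTokens);
}
```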
Private Types
-
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
-
using DecodingInputPtr = std::unique_ptr<DecodingInput>
-
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
Private Functions
Private Members
-
std::size_t const mVocabSize
-
std::size_t const mVocabSizePadded
-
CudaStreamPtr mStream
-
BufferManager mBufferManager
-
TokenPtr mForwardToken
-
std::vector<CudaStreamPtr> mStreams
-
std::vector<GptDecoderPtr> mDecoders
-
std::vector<DecodingInputPtr> mDecodingInputs
-
std::vector<DecodingOutputPtr> mDecodingOutputs
-
DecodingInputPtr mJointDecodingInput
-
DecodingOutputPtr mJointDecodingOutput
-
std::vector<bool> mAcceptByLogits
-
std::vector<bool> mFinished
gptJsonConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptJsonConfig
Public Functions
-
inline GptJsonConfig(std::string name, std::string version, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, GptModelConfig const &modelConfig)
-
inline GptModelConfig getModelConfig() const
-
inline std::string const &getName() const
-
inline std::string const &getVersion() const
-
inline std::string const &getPrecision() const
-
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
-
inline std::string engineFilename(WorldConfig const &worldConfig) const
Public Static Functions
-
static GptJsonConfig parse(std::string const &json)
-
static GptJsonConfig parse(std::istream &json)
-
static GptJsonConfig parse(std::filesystem::path const &path)
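A sketch of loading an engine configuration and deriving the per-rank engine file name, assuming the signatures above; the config.json file name and directory layout are assumptions.

```cpp
#include <gptJsonConfig.h>
#include <worldConfig.h>

#include <filesystem>

using namespace tensorrt_llm::runtime;

std::filesystem::path resolveEngine(std::filesystem::path const& engineDir)
{
    // "config.json" is an assumed file name inside the engine directory.
    auto const json = GptJsonConfig::parse(engineDir / "config.json");
    auto const modelConfig = json.getModelConfig();
    static_cast<void>(modelConfig);

    // Rank and world size are taken from the MPI environment here.
    auto const worldConfig = WorldConfig::mpi();
    return engineDir / json.engineFilename(worldConfig);
}
```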
gptModelConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptModelConfig
-
Public Functions
-
inline explicit GptModelConfig(SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
-
inline constexpr bool useGptAttentionPlugin() const noexcept
-
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
-
inline constexpr bool usePackedInput() const noexcept
-
inline constexpr void usePackedInput(bool inputPacked) noexcept
-
inline constexpr bool usePagedKvCache() const noexcept
-
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
-
inline constexpr common::QuantMode getQuantMode() const noexcept
-
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
-
inline constexpr bool supportsInflightBatching() const noexcept
-
inline constexpr bool usePromptTuning() const noexcept
-
inline constexpr void setMaxPromptEmbeddingTableSize(SizeType maxPromptEmbeddingTableSize) noexcept
-
inline constexpr bool computeContextLogits() const noexcept
-
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
-
inline constexpr bool computeGenerationLogits() const noexcept
-
inline constexpr void computeGenerationLogits(bool computeGenerationLogits) noexcept
-
inline ModelVariant getModelVariant() const
-
inline void setModelVariant(ModelVariant modelVariant)
-
inline constexpr bool useCustomAllReduce() const noexcept
-
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
-
inline constexpr void setUseContextFMHAForGeneration(bool useContextFMHAForGeneration) noexcept
-
inline constexpr bool getContextFMHAForGeneration() const noexcept
-
inline constexpr void setPagedContextFMHA(bool pagedContextFMHA) noexcept
-
inline constexpr bool getPagedContextFMHA() const noexcept
-
inline constexpr bool useLoraPlugin() const noexcept
-
inline constexpr void useLoraPlugin(bool useLoraPlugin) noexcept
-
inline std::vector<LoraModule> const &getLoraModules() const noexcept
-
inline void setLoraModules(std::vector<LoraModule> const &loraModules) noexcept
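An illustrative construction of a model description using the setters above; the concrete sizes are made up, and the note on in-flight batching is an assumption tied to supportsInflightBatching().

```cpp
#include <gptModelConfig.h>

using namespace tensorrt_llm::runtime;

GptModelConfig makeModelConfig()
{
    // Arbitrary example sizes for a half-precision model.
    GptModelConfig config{/*vocabSize=*/32000, /*nbLayers=*/32, /*nbHeads=*/32,
        /*hiddenSize=*/4096, nvinfer1::DataType::kHALF};

    // Features commonly required for in-flight batching (cf. supportsInflightBatching()).
    config.useGptAttentionPlugin(true);
    config.usePackedInput(true);
    config.usePagedKvCache(true);
    return config;
}
```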
Private Members
-
bool mUseGptAttentionPlugin
-
bool mInputPacked
-
bool mPagedKvCache
-
common::QuantMode mQuantMode
-
bool mComputeContextLogits
-
bool mComputeGenerationLogits
-
ModelVariant mModelVariant
-
bool mUseCustomAllReduce
-
bool mUseContextFMHAForGeneration
-
bool mPagedContextFMHA
-
bool mUseLoraPlugin
-
std::vector<LoraModule> mLoraModules
gptSession.h
-
namespace tensorrt_llm
-
-
namespace runtime
-
class GptSession
-
Public Functions
-
GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
-
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
-
inline GptSession(Config const &sessionConfig, GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
-
BufferManager const &getBufferManager() const
-
inline GptModelConfig const &getModelConfig() const
-
inline WorldConfig const &getWorldConfig() const
-
inline int getDevice() const noexcept
-
void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)
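A hedged end-to-end sketch using only the constructor and generate() listed above; populating GenerationInput/GenerationOutput is elided because their members are not documented in this section.

```cpp
#include <gptSession.h>

#include <string>

using namespace tensorrt_llm::runtime;

void runSession(GptSession::Config const& sessionConfig, GptModelConfig const& modelConfig,
    WorldConfig const& worldConfig, std::string const& engineFile, GenerationInput const& inputs,
    GenerationOutput& outputs, SamplingConfig const& samplingConfig)
{
    // Deserializes the engine file and sets up the per-rank runtime buffers.
    GptSession session{sessionConfig, modelConfig, worldConfig, engineFile};

    // Runs the context and generation phases; results are written into `outputs`.
    session.generate(outputs, inputs, samplingConfig);

    // Wait for work on the session's stream before reading the outputs.
    session.getBufferManager().getStream().synchronize();
}
```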
Private Types
-
using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
-
using KvCacheConfig = batch_manager::kv_cache_manager::KvCacheConfig
Private Functions
-
inline bool useCudaGraphs()
-
void generateBatched(std::vector<GenerationOutput> &microBatchesOutputs, std::vector<GenerationInput> const &microBatchesInputs, SamplingConfig const &samplingConfig, TokenGeneratedCallback const &onTokenGenerated)
-
void createContexts()
-
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches)
-
void createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, KvCacheConfig const &config)
-
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
-
void executeContextStep(std::vector<GenerationInput> const &generationBatchesInputs, std::vector<SizeType> const &generationBatchesOffsets, KvCacheManager const *kvCacheManager)
-
SizeType executeGenerationStep(SizeType step, std::vector<GenerationInput> const &microBatchesInputs, std::vector<GenerationOutput> &microBatchesOutputs, std::vector<SizeType> const &microBatchOffsets, KvCacheManager *kvCacheManager, std::vector<bool> &microBatchesFinished)
-
void decoderStepAsync(SizeType decoderStep, SizeType microBatchId)
Execute the decoder on the last PP rank; receive the decoder output on the other PP ranks.
-
bool shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)
Synchronize with the decoder and return the shouldStop flag.
-
void finalize(SizeType microBatchId)
Collect the final output ids and log probs on the last PP rank and send them to the first PP rank.
Receives are asynchronous on the host, so synchronization is required before access.
-
ITensor::SharedPtr initDecoder(ITensor &outputIds, GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig, SizeType microBatchId) const
Populate outputIds and return a reference to the newTokens tensor.
-
TokenGeneratedCallback createOnTokenGeneratedCallback(GenerationOutput &outputs)
Private Members
-
GptModelConfig const mModelConfig
-
WorldConfig const mWorldConfig
-
int mDevice = {-1}
-
std::shared_ptr<NcclCommunicator> mPipelineComm
-
std::shared_ptr<CudaStream> mCommStream
-
std::shared_ptr<TllmRuntime> mRuntime
-
std::shared_ptr<KvCacheManager> mKvCacheManager
-
MicroBatchConfig mMicroBatchConfig
-
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
-
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
-
bool mCudaGraphMode = {false}
-
std::vector<CudaGraphExecutor> mCudaGraphInstances
Friends
- friend class batch_manager::TrtGptModelV1
-
class Config
- #include <gptSession.h>
Configuration for session execution and buffer sizes.
generate may be called with batch size and beam width smaller than the configured parameters. maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.
Public Functions
Public Members
-
bool decoderPerRequest = {false}
-
bool cudaGraphMode = {false}
-
KvCacheConfig kvCacheConfig = {}
-
class CudaGraphExecutor
Public Functions
-
CudaGraphExecutor() = default
-
inline ~CudaGraphExecutor()
-
inline bool hasInstance()
-
void clear()
-
void launch(CudaStream const &stream)
Private Functions
-
void create(cudaGraph_t const &graph)
-
bool update(cudaGraph_t const &graph)
-
void uploadToStream(CudaStream const &stream)
Private Members
-
cudaGraphExec_t mInstance
-
CudaGraphExecutor() = default
-
class MicroBatchConfig
iBuffer.h
-
template<>
struct MemoryTypeString<MemoryType::kGPU> Public Static Attributes
-
static constexpr auto value = "GPU"
-
static constexpr auto value = "GPU"
-
template<>
struct MemoryTypeString<MemoryType::kCPU> Public Static Attributes
-
static constexpr auto value = "CPU"
-
static constexpr auto value = "CPU"
-
template<>
struct MemoryTypeString<MemoryType::kPINNED> Public Static Attributes
-
static constexpr auto value = "PINNED"
-
static constexpr auto value = "PINNED"
-
template<>
struct MemoryTypeString<MemoryType::kUVM> Public Static Attributes
-
static constexpr auto value = "UVM"
-
static constexpr auto value = "UVM"
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32> Public Types
-
using type = std::int32_t
-
using type = std::int32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64> Public Types
-
using type = std::int64_t
-
using type = std::int64_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT32, true> Public Types
-
using type = std::uint32_t
-
using type = std::uint32_t
-
template<>
struct DataTypeTraits<nvinfer1::DataType::kINT64, true> Public Types
-
using type = std::uint64_t
-
using type = std::uint64_t
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned> Public Types
-
using type = bool
-
using type = bool
-
template<bool kUnsigned>
struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned> Public Types
-
using type = std::uint8_t
-
using type = std::uint8_t
-
template<>
struct TRTDataType<std::int8_t>
-
template<>
struct TRTDataType<std::int32_t>
-
template<>
struct TRTDataType<std::uint32_t> Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
template<>
struct TRTDataType<std::int64_t>
-
template<>
struct TRTDataType<std::uint64_t> Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<std::uint8_t>
-
namespace tensorrt_llm
-
namespace runtime
Typedefs
Enums
Functions
-
template<MemoryType T>
struct MemoryTypeString
- template<> struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static constexpr auto value = "GPU"
-
static constexpr auto value = "GPU"
- template<> struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static constexpr auto value = "CPU"
-
static constexpr auto value = "CPU"
- template<> struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static constexpr auto value = "PINNED"
-
static constexpr auto value = "PINNED"
- template<> struct MemoryTypeString<MemoryType::kUVM>
Public Static Attributes
-
static constexpr auto value = "UVM"
-
static constexpr auto value = "UVM"
-
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct DataTypeTraits - #include <iBuffer.h>
For converting a TensorRT data type to a C++ data type.
- template<> struct DataTypeTraits<nvinfer1::DataType::kFLOAT>
Public Types
-
using type = float
-
using type = float
- template<> struct DataTypeTraits<nvinfer1::DataType::kHALF>
Public Types
-
using type = half
Public Static Attributes
-
static constexpr char name[] = "half"
-
static constexpr auto size = sizeof(type)
-
using type = half
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT8>
Public Types
-
using type = std::int8_t
Public Static Attributes
-
static constexpr char name[] = "int8"
-
static constexpr auto size = sizeof(type)
-
using type = std::int8_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT32>
Public Types
-
using type = std::int32_t
Public Static Attributes
-
static constexpr char name[] = "int32"
-
static constexpr auto size = sizeof(type)
-
using type = std::int32_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT64>
Public Types
-
using type = std::int64_t
Public Static Attributes
-
static constexpr char name[] = "int64"
-
static constexpr auto size = sizeof(type)
-
using type = std::int64_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
Public Static Attributes
-
static constexpr char name[] = "uint32"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint32_t
- template<> struct DataTypeTraits<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
Public Static Attributes
-
static constexpr char name[] = "uint64"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint64_t
- template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
Public Static Attributes
-
static constexpr char name[] = "bool"
-
static constexpr auto size = sizeof(type)
-
using type = bool
- template<bool kUnsigned> struct DataTypeTraits<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
Public Static Attributes
-
static constexpr char name[] = "uint8"
-
static constexpr auto size = sizeof(type)
-
using type = std::uint8_t
-
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct DataTypeTraits<kDataType, kUnsigned, true>
-
class BufferDataType
- #include <iBuffer.h>
A wrapper around nvinfer1::DataType that provides support for pointer types.
-
template<typename T, bool = false>
struct TRTDataType - #include <iBuffer.h>
For converting a C++ data type to a TensorRT data type.
-
template<>
struct TRTDataType<float>
-
template<>
struct TRTDataType<half>
- template<> struct TRTDataType<std::int8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT8
-
static constexpr auto value = nvinfer1::DataType::kINT8
- template<> struct TRTDataType<std::int32_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT32
-
static constexpr auto value = nvinfer1::DataType::kINT32
- template<> struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
- template<> struct TRTDataType<std::int64_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT64
-
static constexpr auto value = nvinfer1::DataType::kINT64
- template<> struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<bool>
- template<> struct TRTDataType<std::uint8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
template<>
struct TRTDataType<void*> Public Static Attributes
-
static constexpr auto value = BufferDataType::kTrtPointerType
-
static constexpr auto value = BufferDataType::kTrtPointerType
-
template<typename T>
struct TRTDataType<T*> Public Static Attributes
-
static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
Private Static Attributes
-
static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<T, false>::value}
-
static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
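Compile-time checks illustrating the two mapping directions above: TRTDataType maps a C++ type to a TensorRT data type, and DataTypeTraits maps a TensorRT data type back to a C++ type. Only mappings documented in this section are asserted; the include path is illustrative.

```cpp
#include <iBuffer.h>

#include <cstdint>
#include <type_traits>

using namespace tensorrt_llm::runtime;

// C++ -> TensorRT
static_assert(TRTDataType<std::int8_t>::value == nvinfer1::DataType::kINT8);
static_assert(TRTDataType<std::int32_t>::value == nvinfer1::DataType::kINT32);

// TensorRT -> C++
static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kINT32>::type, std::int32_t>);
static_assert(std::is_same_v<DataTypeTraits<nvinfer1::DataType::kINT64>::type, std::int64_t>);

// Pointer types map to BufferDataType values that carry the pointee's data type
// plus a pointer flag (see TRTDataType<T*> above).
```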
-
class IBuffer
Subclassed by tensorrt_llm::runtime::ITensor
Public Types
Public Functions
-
virtual void *data() = 0
Returns a pointer to underlying array.
-
virtual void const *data() const = 0
Returns a pointer to underlying array.
-
inline virtual void *data(std::size_t index)
Returns a pointer to the underlying array at a given element index.
-
inline virtual void const *data(std::size_t index) const
Returns a pointer to the underlying array at a given element index.
-
virtual std::size_t getSize() const = 0
Returns the size (in number of elements) of the buffer.
-
inline virtual std::size_t getSizeInBytes() const
Returns the size (in bytes) of the buffer.
-
virtual std::size_t getCapacity() const = 0
Returns the capacity of the buffer.
-
virtual char const *getDataTypeName() const
-
virtual MemoryType getMemoryType() const = 0
Returns the memory type of the buffer.
-
virtual char const *getMemoryTypeName() const
-
virtual void resize(std::size_t newSize) = 0
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
virtual void release() = 0
Releases the buffer. It will be reset to nullptr.
-
virtual ~IBuffer() = default
Public Static Functions
Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
- Parameters:
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
- Returns:
A view on the buffer.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying tensor which can be independently resized.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
Returns a view on the underlying tensor with a different size.
- Parameters:
tensor – The tensor to view.
size – The size of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
-
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
- Returns:
An IBuffer.
-
static MemoryType memoryType(void const *data)
Determine the memory type of a pointer.
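A sketch of wrapping caller-owned memory in an IBuffer without copying, using the static helpers above; the element type and sizes are illustrative.

```cpp
#include <iBuffer.h>

#include <array>

using namespace tensorrt_llm::runtime;

void wrapSketch()
{
    // Caller-owned host storage; the IBuffer does not take ownership.
    std::array<float, 8> storage{};

    auto buffer = IBuffer::wrap(storage.data(), nvinfer1::DataType::kFLOAT,
        /*size=*/storage.size(), /*capacity=*/storage.size());

    // Where the wrapped pointer lives (host memory here) and the size in bytes.
    auto const memType = IBuffer::memoryType(storage.data());
    auto const bytes = buffer->getSizeInBytes();
    static_cast<void>(memType);
    static_cast<void>(bytes);

    // Resizing is allowed only up to the wrapped capacity.
    buffer->resize(4);
}
```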
-
template<typename T>
class BufferRange Public Types
-
using size_type = std::size_t
-
using reference = value_type&
-
using const_reference = value_type const&
-
using const_iterator = const_pointer
Public Functions
-
inline const_iterator begin() const
-
inline const_iterator end() const
-
inline const_iterator cbegin()
-
inline const_iterator cend()
-
inline const_iterator cbegin() const
-
inline const_iterator cend() const
-
inline const_reference operator[](size_type index) const
iGptDecoderBatch.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IGptDecoderBatch : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
- #include <iGptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::GptDecoderBatch
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
using TokenPtr = std::unique_ptr<decoder_batch::Token const>
Public Functions
-
virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) = 0
Initialize the decoder at batchIdx with a new request.
-
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::Token const &token) = 0
Wait for the call to forwardAsync associated with a token to complete.
-
inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)
Run one step for all requests and wait for completion on the host.
-
virtual TensorPtr getOutputIds(SizeType batchIdx) const = 0
- Parameters:
batchIdx – index of the batch
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
virtual CudaEvent finalize(SizeType batchIdx) const = 0
Gather final beam search results for request batchIdx. The result will only be available after the returned event has completed.
-
virtual std::vector<bool> getFinished() const = 0
- Returns:
[batchSize (actual)], marks finished requests (per batch)
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, beamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getCumLogProbs(SizeType batchIdx) const = 0
- Returns:
[beamWidth], cumulative log probabilities (per beam) for request batchIdx, on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, beamWidth, maxSeqLen], log probabilities (per beam), on gpu
Protected Functions
-
IGptDecoderBatch() = default
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
namespace decoder_batch
-
-
class Request
Public Types
-
using ConstTensorPtr = ITensor::SharedConstPtr
Public Functions
-
inline explicit Request(ConstTensorPtr ids, SizeType inputLen, std::optional<SizeType> maxNewTokens = std::nullopt, std::optional<SizeType> endId = std::nullopt)
-
using ConstTensorPtr = ITensor::SharedConstPtr
-
class Input
-
Public Functions
-
inline explicit Input(std::vector<TensorConstPtr> const &logits, std::vector<bool> const &active)
-
inline explicit Input(std::vector<TensorConstPtr> const &logits)
Public Members
-
std::vector<TensorConstPtr> logits
-
std::vector<bool> active
-
TensorConstPtr cacheIndirection
-
inline explicit Input(std::vector<TensorConstPtr> const &logits, std::vector<bool> const &active)
-
class Token
iStatefulGptDecoder.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IStatefulGptDecoder
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatch
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxAttentionWindow, SizeType sinkTokenLength, SizeType maxSequenceLength, SizeType maxTokensPerStep, nvinfer1::DataType dtype) = 0
Setup the decoder before calling forward(); this also calls reshapeBuffers.
-
virtual void newBatch(GenerationInput const &inputs, GenerationOutput const &outputs, SamplingConfig const &samplingConfig) = 0
Initialize the decoder with new batch of inputs.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
-
virtual void forwardSync() = 0
Wait for the last call to forwardAsync to complete.
-
inline virtual void forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
-
virtual void finalize() const = 0
Gather final beam search results for all requests.
-
virtual TensorPtr getOutputIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
-
virtual TensorPtr getCumLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
virtual TensorPtr getLogProbs() const = 0
- Returns:
[batchSize, maxBeamWidth, maxSequenceLength], log probabilities (per beam), on gpu
-
virtual TensorPtr getNewTokens(SizeType iter = 0) const = 0
Get tokens generated in one step of last forward pass.
- Parameters:
iter – The iteration within [0; maxTokensPerStep) for which to get the tokens
- Returns:
[batchSize, beamWidth], tokens generated in iter (per beam), on gpu
-
virtual TensorPtr getAllNewTokens() const = 0
Get maxTokensPerStep tokens generated in the last forward pass.
- Returns:
[maxTokensPerStep, batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
-
virtual TensorPtr getNbFinished() const = 0
- Returns:
[1], number of finished sequences, in pinned host memory
-
virtual ~IStatefulGptDecoder() = default
Protected Functions
-
IStatefulGptDecoder() = default
iTensor.h
-
namespace nvinfer1
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
class ITensor : public virtual tensorrt_llm::runtime::IBuffer
Public Types
Public Functions
-
~ITensor() override = default
-
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be volume(dims).
-
inline virtual void resize(std::size_t newSize) override
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
Public Static Functions
-
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
-
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if d.nbDims < 0.
-
static Shape squeeze(Shape const &shape, SizeType dim)
Removes the given unit dimension from shape.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
-
static Shape unsqueeze(Shape const &shape, SizeType dim)
Add a unit dimension to shape at the specified position.
- Parameters:
shape – The shape to unsqueeze.
dim – The dimension where unit dimension should be added.
- Returns:
A new shape with the added unit dimension.
Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the buffer.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying buffer (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
-
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
An ITensor.
-
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
-
static Shape makeShape(std::initializer_list<SizeType> const &dims)
A convenience function to create a tensor shape with the given dimensions.
-
static std::string toString(Shape const &dims)
A convenience function for converting a tensor shape to a string.
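A small example of the static shape utilities above; no tensor allocation is involved, and the include path is illustrative.

```cpp
#include <iTensor.h>

#include <cassert>

using namespace tensorrt_llm::runtime;

void shapeSketch()
{
    // Build a [4, 1, 8] shape and check its volume (number of elements).
    auto const shape = ITensor::makeShape({4, 1, 8});
    assert(ITensor::volume(shape) == 32);

    // Remove the unit dimension at index 1, then add one back at the front.
    auto const squeezed = ITensor::squeeze(shape, 1);        // [4, 8]
    auto const unsqueezed = ITensor::unsqueeze(squeezed, 0); // [1, 4, 8]

    // Human-readable form, e.g. for logging.
    auto const text = ITensor::toString(unsqueezed);
    static_cast<void>(text);
}
```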
Protected Functions
-
ITensor() = default
ipcUtils.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
void setPeerAccess(WorldConfig const &worldConfig, bool enable = true)
-
class IpcMemory
-
Public Functions
-
IpcMemory(WorldConfig const &worldConfig, std::size_t bufferSize)
-
~IpcMemory()
-
inline const std::vector<void*> &getCommPtrsTensor() const
Public Static Attributes
-
static constexpr size_t FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t)
Private Members
-
WorldConfig mWorldConfig
-
std::vector<void*> mCommPtrs
-
std::size_t mBufferSize
-
void *mBufferPtr = {nullptr}
memoryCounters.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MemoryCounters
-
Public Functions
-
MemoryCounters() = default
-
template<MemoryType T>
inline void allocate(SizeType size)
-
void allocate(MemoryType memoryType, SizeType size)
-
template<MemoryType T>
inline void deallocate(SizeType size)
-
void deallocate(MemoryType memoryType, SizeType size)
-
std::string toString() const
Public Static Functions
-
static inline MemoryCounters &getInstance()
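An illustrative use of the singleton counters above; the sizes are arbitrary, and iBuffer.h is assumed to provide the MemoryType enumeration.

```cpp
#include <iBuffer.h>         // for MemoryType
#include <memoryCounters.h>

#include <iostream>

using namespace tensorrt_llm::runtime;

void countersSketch()
{
    auto& counters = MemoryCounters::getInstance();

    // Record a 1 MiB GPU allocation and its later release.
    counters.allocate<MemoryType::kGPU>(1 << 20);
    counters.deallocate(MemoryType::kGPU, 1 << 20);

    // Summary of the current counters.
    std::cout << counters.toString() << std::endl;
}
```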
promptTuningParams.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename TTensor>
class GenericPromptTuningParams -
Public Functions
-
class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams<ITensor::SharedPtr>
Public Types
-
using SizeType = GenericPromptTuningParams::SizeType
samplingConfig.h
tllmLogger.h
worldConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class WorldConfig
Public Functions
-
explicit WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<std::vector<SizeType>> const &deviceIds = std::nullopt)
-
inline constexpr bool isTensorParallel() const noexcept
-
inline constexpr bool isPipelineParallel() const noexcept
-
inline constexpr bool isFirstPipelineParallelRank() const noexcept
-
inline constexpr bool isLastPipelineParallelRank() const noexcept
Is my rank the last rank in its pipeline?
Public Static Functions
-
static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt, std::optional<std::vector<SizeType>> const &deviceIds = std::nullopt)
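A sketch describing a 2-way tensor-parallel, 2-way pipeline-parallel world from rank 0's point of view, using the constructor defaults above; the mpi() factory instead derives the rank (and, by default, the parallelism) from the MPI environment.

```cpp
#include <worldConfig.h>

using namespace tensorrt_llm::runtime;

void worldConfigSketch()
{
    // Explicit construction: tp=2, pp=2, this process is rank 0, 8 GPUs per node.
    WorldConfig const config{/*tensorParallelism=*/2, /*pipelineParallelism=*/2, /*rank=*/0,
        /*gpusPerNode=*/8};

    bool const tensorParallel = config.isTensorParallel();         // true
    bool const firstPpRank = config.isFirstPipelineParallelRank(); // true for rank 0
    static_cast<void>(tensorParallel);
    static_cast<void>(firstPpRank);

    // Alternatively, let MPI determine the rank and world size:
    // auto const mpiConfig = WorldConfig::mpi();
}
```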