Runtime
bufferManager.h
-
namespace tensorrt_llm
-
namespace runtime
-
class BufferManager
- #include <bufferManager.h>
A helper class for managing memory on host and device.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
explicit BufferManager(CudaStreamPtr stream)
Construct a BufferManager.
- Parameters:
cudaStream – [in] The cuda stream to use for all operations on GPU (allocation, de-allocation, copying, etc.).
-
IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size on the GPU.
-
ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions on the GPU.
-
IBufferPtr allocate(MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an IBuffer of the given size and memory type.
-
ITensorPtr allocate(MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const
Allocates an ITensor of the given dimensions and memory type.
-
inline IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty IBuffer of the given memory type. It may be resized later.
-
inline ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
Create an empty ITensor of the given memory type. It may be reshaped later.
-
void copy(void const *src, IBuffer &dst, MemoryType srcType) const
Copy src to dst.
-
void copy(IBuffer const &src, void *dst, MemoryType dstType) const
Copy src to dst.
-
IBufferPtr copyFrom(IBuffer const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
ITensorPtr copyFrom(ITensor const &src, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline IBufferPtr copyFrom(std::vector<T> const &src, MemoryType memoryType) const
Copy src into a new IBuffer with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(T *src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
template<typename T>
inline ITensorPtr copyFrom(std::vector<T> const &src, nvinfer1::Dims dims, MemoryType memoryType) const
Copy src into a new ITensor with a potentially different memory type.
-
CudaStream const &getStream() const
Get the underlying cuda stream.
Public Static Functions
-
static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an IBuffer of the given size on the CPU.
-
static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates an ITensor of the given dimensions on the CPU.
-
static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned IBuffer of the given size on the CPU.
-
static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE)
Allocates a pinned ITensor of the given dimensions on the CPU.
Private Members
-
CudaStreamPtr mStream
Private Static Functions
-
static void initMemoryPool(int device)
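A minimal usage sketch of BufferManager, assuming the tensorrt_llm/runtime headers shown above; shapes, sizes, and token values are placeholders:
```cpp
#include <tensorrt_llm/runtime/bufferManager.h>
#include <tensorrt_llm/runtime/cudaStream.h>

#include <cstdint>
#include <memory>
#include <vector>

using namespace tensorrt_llm::runtime;

int main()
{
    // All GPU work (allocation, copies) is issued on this stream.
    auto stream = std::make_shared<CudaStream>();
    BufferManager manager{stream};

    // Stage token ids on the host and copy them into a fresh GPU buffer.
    std::vector<std::int32_t> tokens{1, 2, 3, 4};
    auto deviceTokens = manager.copyFrom(tokens, MemoryType::kGPU);

    // Allocate an uninitialized [2, 4] float tensor on the GPU.
    auto logits = manager.gpu(nvinfer1::Dims2{2, 4}, nvinfer1::DataType::kFLOAT);

    // Pinned host memory for fast asynchronous device-to-host copies.
    auto hostStaging = BufferManager::pinned(tokens.size(), nvinfer1::DataType::kINT32);
    manager.copy(*deviceTokens, hostStaging->data(), MemoryType::kPINNED);

    // Copies are asynchronous; block before reading the results on the host.
    manager.getStream().synchronize();
    return 0;
}
```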
common.h
-
namespace tensorrt_llm
-
namespace runtime
cudaEvent.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaEvent
Public Types
-
using pointer = cudaEvent_t
Public Functions
-
inline explicit CudaEvent(unsigned int flags = cudaEventDisableTiming)
Creates a new cuda event. The event will be destroyed in the destructor.
- Parameters:
flags – Flags for event creation. By default, event timing is disabled.
-
inline explicit CudaEvent(pointer event, bool ownsEvent = true)
Pass an existing cuda event to this object.
- Parameters:
event – The event to pass to this object.
ownsEvent – Whether this object owns the event and destroys it in the destructor.
-
inline void synchronize() const
Synchronizes the event.
Private Types
-
using EventPtr = std::unique_ptr<element_type, Deleter>
cudaStream.h
-
namespace tensorrt_llm
-
namespace runtime
-
class CudaStream
Public Functions
-
inline explicit CudaStream(unsigned int flags = cudaStreamNonBlocking, int priority = 0)
Creates a new cuda stream on the current device. The stream will be destroyed in the destructor.
- Parameters:
flags – Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed.
priority – Priority of the stream. Lower numbers represent higher priorities. See ::cudaDeviceGetStreamPriorityRange for more information about the meaningful stream priorities that can be passed.
-
inline explicit CudaStream(cudaStream_t stream, int device, bool ownsStream = true)
Pass an existing cuda stream to this object.
- Parameters:
stream – The stream to pass to this object.
device – The device on which the stream was created.
ownsStream – Whether this object owns the stream and destroys it in the destructor.
-
inline int getDevice() const
Returns the device on which the stream was created.
-
inline cudaStream_t get() const
Returns the stream associated with this object.
-
inline void synchronize() const
Synchronizes the stream.
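A sketch combining CudaStream and CudaEvent; it assumes CudaEvent exposes a get() accessor mirroring CudaStream::get(), since only the constructors and synchronize() are excerpted above:
```cpp
#include <tensorrt_llm/runtime/cudaEvent.h>
#include <tensorrt_llm/runtime/cudaStream.h>

#include <cuda_runtime_api.h>

using namespace tensorrt_llm::runtime;

int main()
{
    // Non-blocking stream with default priority on the current device.
    CudaStream stream{cudaStreamNonBlocking, /*priority=*/0};

    // Wrap an externally created stream without taking ownership of it.
    cudaStream_t external{};
    cudaStreamCreate(&external);
    CudaStream wrapped{external, stream.getDevice(), /*ownsStream=*/false};

    // Event with timing disabled (the default), recorded on the stream.
    CudaEvent event{};
    cudaEventRecord(event.get(), stream.get());
    event.synchronize(); // block the host until the event fires

    cudaStreamDestroy(external); // we kept ownership of the wrapped stream
    return 0;
}
```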
decodingInput.h
decodingOutput.h
-
namespace tensorrt_llm
-
namespace runtime
-
class DecodingOutput
-
Public Members
-
BeamHypotheses beamHypotheses
Public Static Attributes
-
static constexpr float kNegativeInfinity = -1e20f
-
class BeamHypotheses
Public Functions
-
void empty(BufferManager &manager)
-
void release()
-
void init(BufferManager &manager, TokenIdType endId)
-
BeamHypotheses slice(SizeType batchIndex, SizeType size) const
generationInput.h
generationOutput.h
gptDecoder.h
-
namespace tensorrt_llm
-
namespace runtime
-
template<typename T>
class GptDecoder : public virtual tensorrt_llm::runtime::IGptDecoder
Public Types
-
using CudaStreamPtr = BufferManager::CudaStreamPtr
Public Functions
-
GptDecoder(size_t vocabSize, size_t vocabSizePadded, CudaStreamPtr const &stream)
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize) override
-
virtual bool forward(DecodingOutput &output, DecodingInput const &input) override
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) override
Private Members
-
BufferManager mManager
-
common::CudaAllocator mAllocator
-
std::shared_ptr<tensorrt_llm::layers::DynamicDecodeLayer<T>> mDynamicDecodeLayer
-
class IGptDecoder
Subclassed by tensorrt_llm::runtime::GptDecoder< T >
Public Functions
-
virtual ~IGptDecoder() = default
-
virtual void setup(SamplingConfig const &samplingConfig, size_t batchSize) = 0
-
virtual bool forward(DecodingOutput &output, DecodingInput const &input) = 0
-
virtual void forwardAsync(DecodingOutput &output, DecodingInput const &input) = 0
Public Static Functions
-
static void gatherTree(ITensor &finalOutputIds, DecodingOutput const &decodingOutput, DecodingInput const &decodingInput, BufferManager const &manager)
-
static inline std::unique_ptr<IGptDecoder> create(nvinfer1::DataType dtype, size_t vocabSize, size_t vocabSizePadded, BufferManager::CudaStreamPtr const &stream)
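A sketch of building a decoder through the IGptDecoder factory; the vocabulary sizes and batch size are placeholders, and the per-step DecodingInput/DecodingOutput setup is elided since those headers are not excerpted here:
```cpp
#include <tensorrt_llm/runtime/gptDecoder.h>

#include <memory>

using namespace tensorrt_llm::runtime;

void buildDecoder()
{
    auto stream = std::make_shared<CudaStream>();

    // The factory instantiates GptDecoder<T> matching the logits data type.
    auto decoder = IGptDecoder::create(
        nvinfer1::DataType::kFLOAT, /*vocabSize=*/32000, /*vocabSizePadded=*/32064, stream);

    SamplingConfig samplingConfig; // members elided; see samplingConfig.h
    decoder->setup(samplingConfig, /*batchSize=*/8);

    // Each decoding step then consumes a DecodingInput and fills a DecodingOutput:
    // bool allFinished = decoder->forward(output, input);
}
```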
gptDecoderBatch.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptDecoderBatch : public tensorrt_llm::runtime::IGptDecoderBatch
- #include <gptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
GptDecoderBatch(std::size_t vocabSize, std::size_t vocabSizePadded, CudaStreamPtr stream)
-
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) override
Set up the decoder before calling forward().
-
virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) override
Initialize the decoder at batchIdx with a new request.
-
virtual void newBatch(GenerationInput const &inputs, SamplingConfig const &samplingConfig) override
Initialize the decoder with a new batch of inputs.
-
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) override
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::Token const &e) override
Wait for the call to forwardAsync associated with a token to complete.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) override
Run one step for all requests without blocking the host thread.
-
virtual bool isFinishedSync() override
Wait for the last call to forwardAsync to complete and return whether all sequences have finished.
-
inline virtual std::vector<bool> getFinished() const override
- Returns:
[batchSize], indicators of finished requests
-
inline virtual TensorPtr getOutputIds(SizeType batchIdx) const override
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
inline virtual TensorPtr getOutputIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu
-
virtual std::tuple<CudaEvent, TensorPtr> getFinalOutputIds(SizeType batchIdx) const override
Execute postProcessRequest and return the output ids for request batchIdx. The result is only available after the returned event has completed.
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
virtual TensorPtr getFinalOutputIds() const override
Execute postProcessRequest and return the output ids.
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding, on gpu
-
inline virtual TensorPtr getParentIds() const override
- Returns:
[batchSize, maxBeamWidth, maxInputLength + maxNewTokens], contains parent ids collected during beam search without padding, on gpu
-
inline virtual TensorPtr getFinishedBeams() const override
- Returns:
[batchSize, maxBeamWidth], marks finished requests (per beam), on gpu
-
inline virtual TensorPtr getOutputLengths() const override
- Returns:
[batchSize, maxBeamWidth], total sequence lengths (per beam), on gpu
-
inline virtual TensorPtr getCumLogProbs() const override
- Returns:
[batchSize, maxBeamWidth], cumulative log probabilities (per beam), on gpu
-
inline virtual TensorPtr getNewTokens() const override
- Returns:
[batchSize, maxBeamWidth], tokens generated in last forward pass, on gpu
Private Types
-
using GptDecoderPtr = std::unique_ptr<IGptDecoder>
-
using DecodingInputPtr = std::unique_ptr<DecodingInput>
-
using DecodingOutputPtr = std::unique_ptr<DecodingOutput>
Private Functions
Private Members
-
const std::size_t mVocabSize
-
const std::size_t mVocabSizePadded
-
CudaStreamPtr mStream
-
BufferManager mBufferManager
-
TokenPtr mForwardToken
-
std::vector<CudaStreamPtr> mStreams
-
std::vector<GptDecoderPtr> mDecoders
-
std::vector<DecodingInputPtr> mDecodingInputs
-
std::vector<DecodingOutputPtr> mDecodingOutputs
-
DecodingInputPtr mJointDecodingInput
-
DecodingOutputPtr mJointDecodingOutput
-
std::vector<bool> mFinished
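A sketch of the in-flight batching flow, assuming decoder_batch::Request and the input/output structures are assembled by the surrounding batch manager (their fields are not excerpted above):
```cpp
#include <tensorrt_llm/runtime/gptDecoderBatch.h>

#include <cstddef>

using namespace tensorrt_llm::runtime;

void decodeStep(GptDecoderBatch& decoder,
    decoder_batch::Output& output, decoder_batch::Input const& input)
{
    // One asynchronous step for all active requests; the returned token
    // is used later to synchronize with exactly this step.
    auto token = decoder.forwardAsync(output, input);

    // ... overlap host-side work here ...

    decoder.forwardSync(*token);

    // Per-request completion flags, indexed by batch slot.
    auto const finished = decoder.getFinished();
    for (std::size_t batchIdx = 0; batchIdx < finished.size(); ++batchIdx)
    {
        if (finished[batchIdx])
        {
            // Gather the final ids; valid only after the returned event.
            auto [event, outputIds] = decoder.getFinalOutputIds(batchIdx);
            event.synchronize();
        }
    }
}
```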
gptJsonConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptJsonConfig
Public Functions
-
inline GptJsonConfig(std::string name, std::string precision, SizeType tensorParallelism, SizeType pipelineParallelism, GptModelConfig const &modelConfig)
-
inline GptModelConfig getModelConfig() const
-
inline std::string const &getName() const
-
inline std::string const &getPrecision() const
-
std::string engineFilename(WorldConfig const &worldConfig, std::string const &model) const
-
inline std::string engineFilename(WorldConfig const &worldConfig) const
Public Static Functions
-
static GptJsonConfig parse(std::string const &json)
-
static GptJsonConfig parse(std::istream &json)
-
static GptJsonConfig parse(std::filesystem::path const &path)
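For example, loading the metadata written at engine build time and deriving the per-rank engine file name (the directory layout here is a placeholder):
```cpp
#include <tensorrt_llm/runtime/gptJsonConfig.h>
#include <tensorrt_llm/runtime/worldConfig.h>

#include <filesystem>

using namespace tensorrt_llm::runtime;

int main()
{
    auto const json = GptJsonConfig::parse(std::filesystem::path{"engines/gpt/config.json"});
    auto const modelConfig = json.getModelConfig();

    // Resolve this process' rank and parallelism from the MPI environment.
    auto const worldConfig = WorldConfig::mpi();

    // engineFilename() combines getName(), getPrecision(), and the world configuration.
    auto const enginePath = std::filesystem::path{"engines/gpt"} / json.engineFilename(worldConfig);
    return 0;
}
```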
gptModelConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptModelConfig
-
Public Functions
-
inline explicit constexpr GptModelConfig(SizeType vocabSize, SizeType nbLayers, SizeType nbHeads, SizeType hiddenSize, nvinfer1::DataType dtype)
-
inline constexpr bool useGptAttentionPlugin() const noexcept
-
inline constexpr void useGptAttentionPlugin(bool useGptAttentionPlugin) noexcept
-
inline constexpr bool usePackedInput() const noexcept
-
inline constexpr void usePackedInput(bool inputPacked) noexcept
-
inline constexpr bool usePagedKvCache() const noexcept
-
inline constexpr void usePagedKvCache(bool pagedKvCache) noexcept
-
inline constexpr common::QuantMode getQuantMode() const noexcept
-
inline constexpr void setQuantMode(common::QuantMode QuantMode) noexcept
-
inline constexpr bool supportsInflightBatching() const noexcept
-
inline constexpr bool computeContextLogits() const noexcept
-
inline constexpr void computeContextLogits(bool computeContextLogits) noexcept
-
inline ModelVariant getModelVariant() const
-
inline void setModelVariant(ModelVariant modelVariant)
-
inline constexpr bool useCustomAllReduce() const noexcept
-
inline constexpr void useCustomAllReduce(bool customAllReduce) noexcept
gptSession.h
-
namespace tensorrt_llm
-
namespace runtime
-
class GptSession
-
Public Functions
-
GptSession(GptModelConfig const &modelConfig, WorldConfig const &worldConfig, void const *engineBuffer, std::size_t engineSize, LoggerPtr logger = nullptr)
-
inline GptSession(GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::vector<uint8_t> const &engineBuffer, LoggerPtr logger = nullptr)
-
inline GptSession(GptModelConfig const &modelConfig, WorldConfig const &worldConfig, std::string const &engineFile, LoggerPtr logger = nullptr)
-
BufferManager &getBufferManager() const
-
inline GptModelConfig const &getModelConfig() const
-
inline WorldConfig const &getWorldConfig() const
-
inline int getDevice() const noexcept
-
inline bool isCudaGraphMode() const noexcept
-
inline void setCudaGraphMode(bool value)
-
void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, bool decoderPerRequest, std::optional<SizeType> maxTokensInPagedKvCache = std::nullopt, std::optional<SizeType> numMicroBatches = std::nullopt)
Initialize buffers for the given sizes.
generate may be called with a batch size and beam width smaller than the setup parameters. maxBatchSize will be divided by the number of micro batches to initialize each batch buffer.
-
inline void generate(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)
Private Types
-
using KvCacheManager = batch_manager::kv_cache_manager::KVCacheManager
Private Functions
-
void generateSingleBatch(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)
-
void generateMultiBatch(GenerationOutput &outputs, GenerationInput const &inputs, SamplingConfig const &samplingConfig)
-
void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches)
-
void createKvCacheManagers(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, SizeType numMicroBatches, std::optional<SizeType> maxTokensInPagedKvCache)
-
void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength)
Execute decoder on last PP rank, receive decoder output on other PP ranks.
-
bool shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)
Synchronize with the decoder and return the shouldStop flag.
-
void finalizeOutputIds(ITensor &outputIds, SizeType microBatchId)
Collect final output ids on last PP rank and send them to first PP rank.
Receives are asynchronous on host, so synchronization is required before access.
-
ITensor::SharedPtr initNewTokens(GenerationInput const &inputs, SamplingConfig const &samplingConfig, SizeType microBatchId)
Private Members
-
const GptModelConfig mModelConfig
-
const WorldConfig mWorldConfig
-
int mDevice = {-1}
-
std::shared_ptr<NcclCommunicator> mPipelineComm
-
std::shared_ptr<CudaStream> mCommStream
-
std::shared_ptr<TllmRuntime> mRuntime
-
std::vector<std::shared_ptr<IStatefulGptDecoder>> mDecoders
-
std::vector<std::shared_ptr<RuntimeBuffers>> mBuffers
-
std::vector<std::shared_ptr<KvCacheManager>> mKvCacheManagers
-
bool mCudaGraphMode = {false}
-
std::array<CudaGraphExecutor, 2> mCudaGraphInstances
-
class CudaGraphExecutor
Public Functions
-
CudaGraphExecutor() = default
-
inline ~CudaGraphExecutor()
-
inline bool hasInstance()
-
void clear()
-
void launch(CudaStream const &stream)
Private Types
-
using cudaGraphExecPtr = cudaGraphExec_t
Private Functions
-
void create(cudaGraph_t const &graph)
-
bool update(cudaGraph_t const &graph)
-
void uploadToStream(CudaStream const &stream)
Private Members
-
cudaGraphExecPtr mInstance
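A condensed end-to-end sketch. GenerationInput and GenerationOutput are listed above (generationInput.h / generationOutput.h) but not excerpted, so their setup is elided; paths and sizes are placeholders:
```cpp
#include <tensorrt_llm/runtime/gptSession.h>

using namespace tensorrt_llm::runtime;

void runSession(GptModelConfig const& modelConfig, WorldConfig const& worldConfig,
    GenerationInput const& inputs, GenerationOutput& outputs,
    SamplingConfig const& samplingConfig)
{
    // Deserialize the engine for this rank from disk.
    GptSession session{modelConfig, worldConfig, "engines/gpt/rank0.engine"};

    // Size all internal buffers once; generate() may use smaller batches.
    session.setup(/*maxBatchSize=*/8, /*maxBeamWidth=*/4, /*maxSequenceLength=*/2048,
        /*decoderPerRequest=*/false);

    session.generate(outputs, inputs, samplingConfig);

    // Outputs are produced on the session's stream.
    session.getBufferManager().getStream().synchronize();
}
```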
iBuffer.h
-
template<>
struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static constexpr auto value = "GPU"
-
template<>
struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static constexpr auto value = "CPU"
-
template<>
struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static constexpr auto value = "PINNED"
-
template<>
struct CppDataType<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
-
template<>
struct CppDataType<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
-
template<bool kUnsigned>
struct CppDataType<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
-
template<bool kUnsigned>
struct CppDataType<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
-
template<>
struct TRTDataType<std::int8_t>
-
template<>
struct TRTDataType<std::int32_t>
-
template<>
struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
-
template<>
struct TRTDataType<std::int64_t>
-
template<>
struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
-
template<>
struct TRTDataType<std::uint8_t>
-
namespace tensorrt_llm
-
namespace runtime
Typedefs
Enums
Functions
-
class BufferDataType
- #include <iBuffer.h>
A wrapper around nvinfer1::DataType that provides support for pointer types.
-
template<typename T>
class BufferRange
Public Types
-
using size_type = std::size_t
-
using reference = value_type&
-
using const_reference = value_type const&
-
using const_iterator = const_pointer
Public Functions
-
inline const_iterator begin() const
-
inline const_iterator end() const
-
inline const_iterator cbegin()
-
inline const_iterator cend()
-
inline const_iterator cbegin() const
-
inline const_iterator cend() const
-
inline const_reference operator[](size_type index) const
-
using size_type = std::size_t
-
template<nvinfer1::DataType kDataType, bool kIsUnsigned = false, bool kIsPointer = false>
struct CppDataType
- #include <iBuffer.h>
For converting a TensorRT data type to a C++ data type.
-
template<nvinfer1::DataType kDataType, bool kUnsigned>
struct CppDataType<kDataType, kUnsigned, true>
- template<bool kUnsigned> struct CppDataType<nvinfer1::DataType::kBOOL, kUnsigned>
Public Types
-
using type = bool
- template<> struct CppDataType<nvinfer1::DataType::kFLOAT>
Public Types
-
using type = float
- template<> struct CppDataType<nvinfer1::DataType::kHALF>
Public Types
-
using type = half
- template<> struct CppDataType<nvinfer1::DataType::kINT32>
Public Types
-
using type = std::int32_t
- template<> struct CppDataType<nvinfer1::DataType::kINT32, true>
Public Types
-
using type = std::uint32_t
- template<> struct CppDataType<nvinfer1::DataType::kINT64>
Public Types
-
using type = std::int64_t
- template<> struct CppDataType<nvinfer1::DataType::kINT64, true>
Public Types
-
using type = std::uint64_t
- template<> struct CppDataType<nvinfer1::DataType::kINT8>
Public Types
-
using type = std::int8_t
- template<bool kUnsigned> struct CppDataType<nvinfer1::DataType::kUINT8, kUnsigned>
Public Types
-
using type = std::uint8_t
-
class IBuffer
Subclassed by tensorrt_llm::runtime::ITensor
Public Types
Public Functions
-
virtual void *data() = 0
Returns a pointer to underlying array.
-
virtual void const *data() const = 0
Returns a pointer to underlying array.
-
inline virtual void *data(std::size_t index)
Returns a pointer to the underlying array at a given element index.
-
inline virtual void const *data(std::size_t index) const
Returns a pointer to the underlying array at a given element index.
-
virtual std::size_t getSize() const = 0
Returns the size (in number of elements) of the buffer.
-
inline virtual std::size_t getSizeInBytes() const
Returns the size (in bytes) of the buffer.
-
virtual std::size_t getCapacity() const = 0
Returns the capacity of the buffer.
-
virtual MemoryType getMemoryType() const = 0
Returns the memory type of the buffer.
-
virtual void resize(std::size_t newSize) = 0
Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity.
-
virtual void release() = 0
Releases the buffer. It will be reset to nullptr.
-
virtual ~IBuffer() = default
Public Static Functions
Creates a sliced view on the underlying buffer. The view will have the same data type as buffer.
- Parameters:
buffer – The buffer to view.
offset – The offset of the view.
size – The size of the view.
- Returns:
A view on the buffer.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying tensor which can be independently resized.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
Returns a view on the underlying tensor with a different size.
- Parameters:
tensor – The tensor to view.
size – The size of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, std::size_t size)
-
static UniquePtr wrap(void *data, DataType type, std::size_t size, std::size_t capacity)
Wraps the given data in an IBuffer. The IBuffer will not own the underlying data and cannot be resized beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
size – The size of the buffer.
capacity – The capacity of the buffer.
- Returns:
An IBuffer.
-
static MemoryType memoryType(void const *data)
Determine the memory type of a pointer.
-
template<MemoryType T>
struct MemoryTypeString
- template<> struct MemoryTypeString<MemoryType::kCPU>
Public Static Attributes
-
static constexpr auto value = "CPU"
- template<> struct MemoryTypeString<MemoryType::kGPU>
Public Static Attributes
-
static constexpr auto value = "GPU"
- template<> struct MemoryTypeString<MemoryType::kPINNED>
Public Static Attributes
-
static constexpr auto value = "PINNED"
-
template<typename T, bool = false>
struct TRTDataType
- #include <iBuffer.h>
For converting a C++ data type to a TensorRT data type.
-
template<>
struct TRTDataType<bool>
-
template<>
struct TRTDataType<float>
-
template<>
struct TRTDataType<half>
- template<> struct TRTDataType<std::int32_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT32
- template<> struct TRTDataType<std::int64_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT64
- template<> struct TRTDataType<std::int8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kINT8
- template<> struct TRTDataType<std::uint32_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT32, true}
- template<> struct TRTDataType<std::uint64_t>
Public Static Attributes
-
static constexpr auto value = BufferDataType{nvinfer1::DataType::kINT64, true}
- template<> struct TRTDataType<std::uint8_t>
Public Static Attributes
-
static constexpr auto value = nvinfer1::DataType::kUINT8
-
template<typename T>
struct TRTDataType<T*>
Public Static Attributes
-
static constexpr auto value = BufferDataType{kUnderlyingType.getDataType(), kUnderlyingType.isUnsigned(), true}
Private Static Attributes
-
static constexpr auto kUnderlyingType = BufferDataType{TRTDataType<T, false>::value}
-
template<>
struct TRTDataType<void*>
Public Static Attributes
-
static constexpr auto value = BufferDataType::kTrtPointerType
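The type-mapping traits can be exercised at compile time, and IBuffer::memoryType classifies raw pointers at run time. A small sketch using only the specializations documented above:
```cpp
#include <tensorrt_llm/runtime/iBuffer.h>

#include <cstdint>
#include <type_traits>
#include <vector>

using namespace tensorrt_llm::runtime;

// C++ type -> TensorRT data type.
static_assert(TRTDataType<std::int32_t>::value == nvinfer1::DataType::kINT32);
static_assert(TRTDataType<std::uint8_t>::value == nvinfer1::DataType::kUINT8);

// TensorRT data type -> C++ type; 'true' selects the unsigned variant.
static_assert(std::is_same_v<CppDataType<nvinfer1::DataType::kINT32, true>::type, std::uint32_t>);

int main()
{
    // Classify a raw pointer at run time; plain heap memory reports kCPU.
    std::vector<float> host(16);
    return IBuffer::memoryType(host.data()) == MemoryType::kCPU ? 0 : 1;
}
```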
iGptDecoderBatch.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IGptDecoderBatch : public virtual tensorrt_llm::runtime::IStatefulGptDecoder
- #include <iGptDecoderBatch.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::GptDecoderBatch
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
-
using TokenPtr = std::unique_ptr<decoder_batch::Token const>
Public Functions
-
virtual void newRequest(SizeType batchIdx, decoder_batch::Request const &request, SamplingConfig const &samplingConfig) = 0
Initialize the decoder at batchIdx with a new request.
-
virtual TokenPtr forwardAsync(decoder_batch::Output &output, decoder_batch::Input const &input) = 0
Run one step for all requests without blocking the host process and return the token for synchronization.
-
virtual void forwardSync(decoder_batch::Token const &token) = 0
Wait for the call to forwardAsync associated with a token to complete.
-
inline virtual void forward(decoder_batch::Output &output, decoder_batch::Input const &input)
Run one step for all requests and wait for completion on the host.
-
virtual TensorPtr getOutputIds(SizeType batchIdx) const = 0
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request
batchIdx, on gpu
-
virtual std::tuple<CudaEvent, TensorPtr> getFinalOutputIds(SizeType batchIdx) const = 0
Execute postProcessRequest and return the output ids for request batchIdx. The result is only available after the returned event has completed.
- Returns:
[maxBeamWidth, maxInputLength + maxNewTokens], contains input token ids and generated token ids without padding for request batchIdx, on gpu
-
virtual TensorPtr getFinishedBeams() const = 0
- Returns:
[batchSize, beamWidth], marks finished requests (per beam), on gpu
-
virtual TensorPtr getOutputLengths() const = 0
- Returns:
[batchSize, beamWidth], total sequence lengths (per beam), on gpu
-
virtual std::vector<bool> getFinished() const = 0
- Returns:
[batchSize (actual)], marks finished requests (per batch)
Protected Functions
-
IGptDecoderBatch() = default
-
namespace decoder_batch
-
class Input : public tensorrt_llm::runtime::decoder::Input
-
Public Functions
-
inline explicit Input(TensorPtr logits)
-
inline explicit Input(TensorPtr logits, std::vector<bool> const &active)
Public Members
-
std::vector<bool> active
-
class Request
-
Public Functions
-
class Token
iStatefulGptDecoder.h
-
namespace tensorrt_llm
-
namespace runtime
-
class IStatefulGptDecoder
- #include <iStatefulGptDecoder.h>
GPT decoder class with support for in-flight batching.
Subclassed by tensorrt_llm::runtime::IGptDecoderBatch
Public Types
-
using CudaStreamPtr = std::shared_ptr<CudaStream>
Public Functions
-
virtual void setup(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, nvinfer1::DataType dtype) = 0
Set up the decoder before calling forward(); this also calls reshapeBuffers.
-
virtual void newBatch(GenerationInput const &inputs, SamplingConfig const &samplingConfig) = 0
Initialize the decoder with a new batch of inputs.
-
virtual void forwardAsync(decoder::Output &output, decoder::Input const &input) = 0
Run one step for all requests without blocking the host thread.
-
virtual bool isFinishedSync() = 0
Wait for the last call to forwardAsync to complete and return whether all sequences have finished.
-
inline virtual bool forward(decoder::Output &output, decoder::Input const &input)
Run one step for all requests.
-
virtual TensorPtr getOutputIds() const = 0
- Returns:
[batchSize, beamWidth, maxSequenceLength], all token ids, on gpu
Protected Functions
-
IStatefulGptDecoder() = default
-
namespace decoder
-
class Input
Subclassed by tensorrt_llm::runtime::decoder_batch::Input
iTensor.h
-
namespace nvinfer1
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
class ITensor : public virtual tensorrt_llm::runtime::IBuffer
Public Types
Public Functions
-
virtual void reshape(Shape const &dims) = 0
Sets the tensor dimensions. The new size of the tensor will be volume(dims).
-
~ITensor() override = default
Public Static Functions
-
static inline std::int64_t volume(Shape const &dims)
Returns the volume of the dimensions. Returns -1 if d.nbDims < 0.
-
static inline std::size_t volumeNonNegative(Shape const &shape)
Returns the volume of the dimensions. Throws if d.nbDims < 0.
-
static inline Shape squeeze(Shape const &shape, SizeType dim)
Removes the given unit dimension from shape.
- Parameters:
shape – The shape to squeeze.
dim – The dimension that should be removed (“squeezed”).
- Returns:
A new shape without the unit dimension.
Creates a sliced view on the underlying tensor. The view will have the same data type as tensor.
- Parameters:
tensor – The tensor to view.
offset – The offset of the view w.r.t. dimension 0 of the tensor.
size – The size of the view w.r.t. dimension 0 of the tensor.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset, std::size_t size)
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr slice(TConstPtr &&tensor, std::size_t offset)
Returns a view on the underlying buffer (or tensor) with the given shape.
- Parameters:
tensor – The tensor to view.
shape – The shape of the view.
- Returns:
A view on the tensor.
-
template<typename TConstPtr, std::enable_if_t<std::is_const_v<PointerElementType<TConstPtr>>, int> = 0>
static inline UniqueConstPtr view(TConstPtr &&tensor, Shape const &dims)
Returns a view on the underlying tensor which can be independently reshaped.
- Parameters:
tensor – The tensor to view.
- Returns:
A view on the tensor.
-
static UniquePtr wrap(void *data, nvinfer1::DataType type, Shape const &shape, std::size_t capacity)
Wraps the given data in an ITensor. The ITensor will not own the underlying data and cannot be reshaped beyond capacity.
- Parameters:
data – The data to wrap.
type – The data type of the data.
shape – The shape of the tensor.
capacity – The capacity of the buffer.
- Returns:
An ITensor.
-
template<typename T>
static inline UniquePtr wrap(T *data, Shape const &shape, std::size_t capacity)
Protected Functions
-
ITensor() = default
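For instance, wrapping existing host memory in a non-owning tensor and using the static shape helpers (a sketch; the wrapped pointer must outlive the tensor):
```cpp
#include <tensorrt_llm/runtime/iTensor.h>

#include <vector>

using namespace tensorrt_llm::runtime;

int main()
{
    std::vector<float> data(2 * 3 * 4);

    // Non-owning view of shape [2, 3, 4]; it cannot grow beyond data.size().
    auto tensor = ITensor::wrap(data.data(), nvinfer1::Dims3{2, 3, 4}, data.size());

    // volume() multiplies the dimensions: 2 * 3 * 4 = 24.
    auto const n = ITensor::volume(nvinfer1::Dims3{2, 3, 4});

    // squeeze() drops a unit dimension: [1, 3, 4] -> [3, 4].
    auto const squeezed = ITensor::squeeze(nvinfer1::Dims3{1, 3, 4}, 0);

    // Reshape within the wrapped capacity; the new size is volume(dims).
    tensor->reshape(nvinfer1::Dims2{6, 4});
    return n == 24 ? 0 : 1;
}
```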
ipcUtils.h
-
namespace tensorrt_llm
-
namespace runtime
Functions
-
void setPeerAccess(WorldConfig worldConfig, bool enable = true)
-
class IpcMemory
-
Public Functions
-
IpcMemory(WorldConfig worldConfig, std::size_t bufferSize)
-
~IpcMemory()
-
inline const std::vector<void*> &getCommPtrsTensor() const
Public Static Attributes
-
static constexpr size_t FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t)
Private Members
-
WorldConfig mWorldConfig
-
std::vector<void*> mCommPtrs
-
std::size_t mBufferSize
-
void *mBufferPtr
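A sketch of setting up a buffer shared across the ranks of a world; the buffer size is a placeholder:
```cpp
#include <tensorrt_llm/runtime/ipcUtils.h>
#include <tensorrt_llm/runtime/worldConfig.h>

using namespace tensorrt_llm::runtime;

void setupIpc()
{
    auto const worldConfig = WorldConfig::mpi();

    // Enable CUDA peer access between the GPUs participating in this world.
    setPeerAccess(worldConfig, /*enable=*/true);

    // Allocate an IPC buffer visible to all ranks; one pointer per rank.
    IpcMemory ipcMemory{worldConfig, /*bufferSize=*/1 << 22};
    auto const& commPtrs = ipcMemory.getCommPtrsTensor();
}
```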
memoryCounters.h
-
namespace tensorrt_llm
-
namespace runtime
-
class MemoryCounters
-
Public Functions
-
MemoryCounters() = default
-
template<MemoryType T>
inline void allocate(SizeType size)
-
void allocate(MemoryType memoryType, SizeType size)
-
template<MemoryType T>
inline void deallocate(SizeType size)
-
void deallocate(MemoryType memoryType, SizeType size)
Public Static Functions
-
static inline MemoryCounters &getInstance()
Private Members
Private Static Attributes
-
static thread_local MemoryCounters mInstance
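The counters form a thread-local singleton that allocators update. A sketch (bookkeeping only; no memory is actually allocated here):
```cpp
#include <tensorrt_llm/runtime/memoryCounters.h>

using namespace tensorrt_llm::runtime;

void trackAllocation()
{
    auto& counters = MemoryCounters::getInstance();

    // Record a 1 MiB device allocation and its release (compile-time dispatch).
    counters.allocate<MemoryType::kGPU>(1 << 20);
    counters.deallocate<MemoryType::kGPU>(1 << 20);

    // The run-time overloads take the memory type as an argument.
    counters.allocate(MemoryType::kCPU, 4096);
    counters.deallocate(MemoryType::kCPU, 4096);
}
```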
samplingConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class SamplingConfig
-
Public Members
tllmLogger.h
-
namespace tensorrt_llm
-
namespace runtime
-
class TllmLogger : public ILogger
worldConfig.h
-
namespace tensorrt_llm
-
namespace runtime
-
class WorldConfig
Public Functions
-
inline explicit constexpr WorldConfig(SizeType tensorParallelism = 1, SizeType pipelineParallelism = 1, SizeType rank = 0, SizeType gpusPerNode = kDefaultGpusPerNode)
-
inline constexpr bool isTensorParallel() const noexcept
-
inline constexpr bool isPipelineParallel() const noexcept
-
inline constexpr bool isFirstPipelineParallelRank() const noexcept
-
inline constexpr bool isLastPipelineParallelRank() const noexcept
Public Static Functions
-
static bool validConfig(nvinfer1::ILogger &logger, SizeType tensorParallelism, SizeType pipelineParallelism)
-
static WorldConfig mpi(nvinfer1::ILogger &logger, SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt)
-
static WorldConfig mpi(SizeType gpusPerNode = kDefaultGpusPerNode, std::optional<SizeType> tensorParallelism = std::nullopt, std::optional<SizeType> pipelineParallelism = std::nullopt)
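For example, querying the rank layout (a sketch; in multi-process runs the mpi() factory reads the actual rank and world size from the environment):
```cpp
#include <tensorrt_llm/runtime/worldConfig.h>

using namespace tensorrt_llm::runtime;

int main()
{
    // Explicit single-node layout: TP=2, PP=2, this process is rank 0.
    WorldConfig const world{/*tensorParallelism=*/2, /*pipelineParallelism=*/2, /*rank=*/0};

    bool const tp = world.isTensorParallel();
    bool const firstPp = world.isFirstPipelineParallelRank(); // whether this rank runs the first pipeline stage

    // Or let MPI determine rank and parallelism at run time.
    auto const mpiWorld = WorldConfig::mpi();
    return tp && firstPp ? 0 : 1;
}
```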