tensorrt_llm
Contents:
TensorRT-LLM Architecture
C++ GPT Runtime
The Batch Manager in TensorRT-LLM
Multi-head, Multi-query and Group-query Attention
Numerical Precision
Performance of TensorRT-LLM
Build From Sources
How to debug
How to add a new model
Graph Rewriting Module
Python API
Layers
Functionals
Models
Plugin
Quantization
Runtime
C++ API
Runtime
tensorrt_llm
Index
Index
A | B | C | D | E | F | G | H | I | K | L | M | N | O | P | Q | R | S | T | U | V | W
A
abs() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
activation() (in module tensorrt_llm.functional)
add() (in module tensorrt_llm.functional)
add_sequence() (tensorrt_llm.runtime.KVCacheManager method)
alibi (tensorrt_llm.functional.PositionEmbeddingType attribute)
alibi_with_scale (tensorrt_llm.functional.PositionEmbeddingType attribute)
allgather() (in module tensorrt_llm.functional)
allreduce() (in module tensorrt_llm.functional)
AllReduceStrategy (class in tensorrt_llm.functional)
arange() (in module tensorrt_llm.functional)
argmax() (in module tensorrt_llm.functional)
assertion() (in module tensorrt_llm.functional)
Attention (class in tensorrt_llm.layers.attention)
AttentionMaskType (class in tensorrt_llm.functional)
AttentionParams (class in tensorrt_llm.layers.attention)
AUTO (tensorrt_llm.functional.AllReduceStrategy attribute)
avg_pool2d() (in module tensorrt_llm.functional)
AvgPool2d (class in tensorrt_llm.layers.pooling)
B
BaichuanForCausalLM (class in tensorrt_llm.models)
batch_size (tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession attribute)
(tensorrt_llm.runtime.GenerationSession attribute)
beam_search_diversity_rate (tensorrt_llm.runtime.SamplingConfig attribute)
bert_attention() (in module tensorrt_llm.functional)
BertAttention (class in tensorrt_llm.layers.attention)
BertForQuestionAnswering (class in tensorrt_llm.models)
BertModel (class in tensorrt_llm.models)
bidirectional (tensorrt_llm.functional.AttentionMaskType attribute)
BloomForCausalLM (class in tensorrt_llm.models)
BloomModel (class in tensorrt_llm.models)
broadcast_helper() (in module tensorrt_llm.functional)
buffer_allocated (tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession attribute)
(tensorrt_llm.runtime.GenerationSession attribute)
C
Cast (class in tensorrt_llm.layers.cast)
cast() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
causal (tensorrt_llm.functional.AttentionMaskType attribute)
ChatGLM2HeadModel (class in tensorrt_llm.models)
ChatGLM2Model (class in tensorrt_llm.models)
ChatGLM6BHeadModel (class in tensorrt_llm.models)
ChatGLM6BHeadModelGenerationSession (class in tensorrt_llm.runtime)
ChatGLM6BModel (class in tensorrt_llm.models)
choices() (tensorrt_llm.functional.PositionEmbeddingType static method)
chunk() (in module tensorrt_llm.functional)
clip() (in module tensorrt_llm.functional)
ColumnLinear (in module tensorrt_llm.layers.linear)
concat() (in module tensorrt_llm.functional)
constant() (in module tensorrt_llm.functional)
constant_to_tensor_() (in module tensorrt_llm.functional)
context (tensorrt_llm.runtime.Session property)
Conv2d (class in tensorrt_llm.layers.conv)
conv2d() (in module tensorrt_llm.functional)
conv_transpose2d() (in module tensorrt_llm.functional)
ConvTranspose2d (class in tensorrt_llm.layers.conv)
cos() (in module tensorrt_llm.functional)
cross_attention (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
cuda_graph_mode (tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession attribute)
(tensorrt_llm.runtime.GenerationSession attribute)
cuda_stream_guard() (tensorrt_llm.runtime.GenerationSession method)
D
debug_mode (tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession attribute)
(tensorrt_llm.runtime.GenerationSession attribute)
debug_tensors_to_save (tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession attribute)
(tensorrt_llm.runtime.GenerationSession attribute)
decode() (tensorrt_llm.runtime.GenerationSession method)
decode_batch() (tensorrt_llm.runtime.GenerationSession method)
decode_regular() (tensorrt_llm.runtime.GenerationSession method)
decode_stream() (tensorrt_llm.runtime.GenerationSession method)
DecoderModel (class in tensorrt_llm.models)
device (tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession attribute)
(tensorrt_llm.runtime.GenerationSession attribute)
DimRange (class in tensorrt_llm.functional)
div() (in module tensorrt_llm.functional)
dtype (tensorrt_llm.functional.Tensor property)
(tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
(tensorrt_llm.runtime.TensorInfo attribute)
dynamic (tensorrt_llm.functional.RotaryScalingType attribute)
E
einsum() (in module tensorrt_llm.functional)
elementwise_binary() (in module tensorrt_llm.functional)
Embedding (class in tensorrt_llm.layers.embedding)
embedding() (in module tensorrt_llm.functional)
EncoderModel (class in tensorrt_llm.models)
end_id (tensorrt_llm.runtime.SamplingConfig attribute)
engine (tensorrt_llm.runtime.Session property)
eq() (in module tensorrt_llm.functional)
exp() (in module tensorrt_llm.functional)
expand() (in module tensorrt_llm.functional)
expand_dims() (in module tensorrt_llm.functional)
expand_dims_like() (in module tensorrt_llm.functional)
expand_mask() (in module tensorrt_llm.functional)
F
FalconForCausalLM (class in tensorrt_llm.models)
FalconModel (class in tensorrt_llm.models)
finalize_decoder() (tensorrt_llm.runtime.GenerationSession method)
first_layer (tensorrt_llm.runtime.GenerationSession property)
flip() (in module tensorrt_llm.functional)
forward() (tensorrt_llm.layers.activation.Mish method)
(tensorrt_llm.layers.attention.Attention method)
(tensorrt_llm.layers.attention.BertAttention method)
(tensorrt_llm.layers.cast.Cast method)
(tensorrt_llm.layers.conv.Conv2d method)
(tensorrt_llm.layers.conv.ConvTranspose2d method)
(tensorrt_llm.layers.embedding.Embedding method)
(tensorrt_llm.layers.embedding.PromptTuningEmbedding method)
(tensorrt_llm.layers.linear.Linear method)
(tensorrt_llm.layers.linear.RowLinear method)
(tensorrt_llm.layers.mlp.GatedMLP method)
(tensorrt_llm.layers.mlp.MLP method)
(tensorrt_llm.layers.normalization.GroupNorm method)
(tensorrt_llm.layers.normalization.LayerNorm method)
(tensorrt_llm.layers.normalization.RmsNorm method)
(tensorrt_llm.layers.pooling.AvgPool2d method)
(tensorrt_llm.models.BaichuanForCausalLM method)
(tensorrt_llm.models.BertForQuestionAnswering method)
(tensorrt_llm.models.BertModel method)
(tensorrt_llm.models.BloomForCausalLM method)
(tensorrt_llm.models.BloomModel method)
(tensorrt_llm.models.ChatGLM2HeadModel method)
(tensorrt_llm.models.ChatGLM2Model method)
(tensorrt_llm.models.ChatGLM6BHeadModel method)
(tensorrt_llm.models.ChatGLM6BModel method)
(tensorrt_llm.models.DecoderModel method)
(tensorrt_llm.models.EncoderModel method)
(tensorrt_llm.models.FalconForCausalLM method)
(tensorrt_llm.models.FalconModel method)
(tensorrt_llm.models.GPTJForCausalLM method)
(tensorrt_llm.models.GPTJModel method)
(tensorrt_llm.models.GPTLMHeadModel method)
(tensorrt_llm.models.GPTModel method)
(tensorrt_llm.models.GPTNeoXForCausalLM method)
(tensorrt_llm.models.GPTNeoXModel method)
(tensorrt_llm.models.LLaMAForCausalLM method)
(tensorrt_llm.models.LLaMAModel method)
(tensorrt_llm.models.OPTLMHeadModel method)
(tensorrt_llm.models.OPTModel method)
fp8_quantize() (in module tensorrt_llm.models)
from_engine() (tensorrt_llm.runtime.Session static method)
from_serialized_engine() (tensorrt_llm.runtime.Session static method)
G
GatedMLP (class in tensorrt_llm.layers.mlp)
gather() (in module tensorrt_llm.functional)
gather_all_token_logits (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
gather_last_token_logits() (in module tensorrt_llm.functional)
geglu() (in module tensorrt_llm.functional)
gelu() (in module tensorrt_llm.functional)
generate_alibi_biases() (in module tensorrt_llm.functional)
generate_alibi_slopes() (in module tensorrt_llm.functional)
GenerationSequence (class in tensorrt_llm.runtime)
GenerationSession (class in tensorrt_llm.runtime)
get_batch_idx() (tensorrt_llm.runtime.GenerationSequence method)
get_first_kv_cache_block_pointers() (tensorrt_llm.layers.attention.KeyValueCacheParams method)
get_first_past_key_value() (tensorrt_llm.layers.attention.KeyValueCacheParams method)
get_parent() (tensorrt_llm.functional.Tensor method)
get_pointer_arrays() (tensorrt_llm.runtime.KVCacheManager method)
get_seq_idx() (tensorrt_llm.runtime.GenerationSequence method)
get_users() (tensorrt_llm.functional.Tensor method)
gpt_attention() (in module tensorrt_llm.functional)
gpt_attention_plugin (tensorrt_llm.runtime.ModelConfig attribute)
GPTJForCausalLM (class in tensorrt_llm.models)
GPTJModel (class in tensorrt_llm.models)
GPTLMHeadModel (class in tensorrt_llm.models)
GPTModel (class in tensorrt_llm.models)
GPTNeoXForCausalLM (class in tensorrt_llm.models)
GPTNeoXModel (class in tensorrt_llm.models)
group_norm() (in module tensorrt_llm.functional)
GroupNorm (class in tensorrt_llm.layers.normalization)
(tensorrt_llm.functional.LayerNormType attribute)
gt() (in module tensorrt_llm.functional)
H
handle_per_step() (tensorrt_llm.runtime.GenerationSession method)
has_position_embedding (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
has_token_type_embedding (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
head_size (tensorrt_llm.runtime.GenerationSession property)
hidden_size (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
I
identity() (in module tensorrt_llm.functional)
index_select() (in module tensorrt_llm.functional)
infer_shapes() (tensorrt_llm.runtime.Session method)
interpolate() (in module tensorrt_llm.functional)
is_alibi() (tensorrt_llm.functional.PositionEmbeddingType method)
is_dynamic() (tensorrt_llm.functional.Tensor method)
is_gated_activation() (in module tensorrt_llm.functional)
is_rope() (tensorrt_llm.functional.PositionEmbeddingType method)
is_trt_wrapper() (tensorrt_llm.functional.Tensor method)
is_valid() (tensorrt_llm.layers.attention.AttentionParams method)
(tensorrt_llm.layers.attention.KeyValueCacheParams method)
is_valid_cross_attn() (tensorrt_llm.layers.attention.AttentionParams method)
K
KeyValueCacheParams (class in tensorrt_llm.layers.attention)
KVCacheManager (class in tensorrt_llm.runtime)
L
last_layer (tensorrt_llm.runtime.GenerationSession property)
layer_norm() (in module tensorrt_llm.functional)
LayerNorm (class in tensorrt_llm.layers.normalization)
(tensorrt_llm.functional.LayerNormType attribute)
LayerNormPositionType (class in tensorrt_llm.functional)
LayerNormType (class in tensorrt_llm.functional)
learned_absolute (tensorrt_llm.functional.PositionEmbeddingType attribute)
length_penalty (tensorrt_llm.runtime.SamplingConfig attribute)
Linear (class in tensorrt_llm.layers.linear)
linear (tensorrt_llm.functional.RotaryScalingType attribute)
LLaMAForCausalLM (class in tensorrt_llm.models)
LLaMAModel (class in tensorrt_llm.models)
location (tensorrt_llm.functional.Tensor property)
lt() (in module tensorrt_llm.functional)
M
mapping (tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession attribute)
(tensorrt_llm.runtime.GenerationSession attribute)
mark_output() (tensorrt_llm.functional.Tensor method)
matmul() (in module tensorrt_llm.functional)
max() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
maximum() (in module tensorrt_llm.functional)
mean() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
min_length (tensorrt_llm.runtime.SamplingConfig attribute)
minimum() (in module tensorrt_llm.functional)
Mish (class in tensorrt_llm.layers.activation)
MLP (class in tensorrt_llm.layers.mlp)
model_name (tensorrt_llm.runtime.ModelConfig attribute)
ModelConfig (class in tensorrt_llm.runtime)
module
tensorrt_llm, [1], [2], [3], [4], [5]
tensorrt_llm.functional
tensorrt_llm.layers.activation
tensorrt_llm.layers.attention
tensorrt_llm.layers.cast
tensorrt_llm.layers.conv
tensorrt_llm.layers.embedding
tensorrt_llm.layers.linear
tensorrt_llm.layers.mlp
tensorrt_llm.layers.normalization
tensorrt_llm.layers.pooling
tensorrt_llm.models
tensorrt_llm.plugin
tensorrt_llm.quantization
tensorrt_llm.runtime
mul() (in module tensorrt_llm.functional)
multiply_gather() (tensorrt_llm.layers.linear.Linear method)
multiply_reduce() (tensorrt_llm.layers.linear.RowLinear method)
N
name (tensorrt_llm.functional.Tensor property)
(tensorrt_llm.runtime.TensorInfo attribute)
ndim() (tensorrt_llm.functional.Tensor method)
non_gated_version() (in module tensorrt_llm.functional)
none (tensorrt_llm.functional.RotaryScalingType attribute)
num_beams (tensorrt_llm.runtime.SamplingConfig attribute)
num_heads (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
num_heads_kv (tensorrt_llm.runtime.GenerationSession property)
num_kv_heads (tensorrt_llm.runtime.ModelConfig attribute)
num_layers (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
nvinfer1 (C++ type)
O
ONESHOT (tensorrt_llm.functional.AllReduceStrategy attribute)
op_and() (in module tensorrt_llm.functional)
op_or() (in module tensorrt_llm.functional)
OPTLMHeadModel (class in tensorrt_llm.models)
OPTModel (class in tensorrt_llm.models)
outer() (in module tensorrt_llm.functional)
output_cum_log_probs (tensorrt_llm.runtime.SamplingConfig attribute)
output_log_probs (tensorrt_llm.runtime.SamplingConfig attribute)
P
pad_id (tensorrt_llm.runtime.SamplingConfig attribute)
padding (tensorrt_llm.functional.AttentionMaskType attribute)
paged_kv_cache (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
permute() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
PositionEmbeddingType (class in tensorrt_llm.functional)
post_layernorm (tensorrt_llm.functional.LayerNormPositionType attribute)
pow() (in module tensorrt_llm.functional)
pp_communicate_final_output_ids() (tensorrt_llm.runtime.GenerationSession method)
pp_communicate_new_tokens() (tensorrt_llm.runtime.GenerationSession method)
pre_layernorm (tensorrt_llm.functional.LayerNormPositionType attribute)
prepare_inputs() (tensorrt_llm.models.BaichuanForCausalLM method)
(tensorrt_llm.models.BloomForCausalLM method)
(tensorrt_llm.models.ChatGLM2HeadModel method)
(tensorrt_llm.models.ChatGLM6BHeadModel method)
(tensorrt_llm.models.DecoderModel method)
(tensorrt_llm.models.EncoderModel method)
(tensorrt_llm.models.FalconForCausalLM method)
(tensorrt_llm.models.GPTJForCausalLM method)
(tensorrt_llm.models.GPTLMHeadModel method)
(tensorrt_llm.models.GPTNeoXForCausalLM method)
(tensorrt_llm.models.LLaMAForCausalLM method)
(tensorrt_llm.models.OPTLMHeadModel method)
presence_penalty (tensorrt_llm.runtime.SamplingConfig attribute)
PromptTuningEmbedding (class in tensorrt_llm.layers.embedding)
Q
quant_mode (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
QuantMode (class in tensorrt_llm.quantization)
R
random_seed (tensorrt_llm.runtime.SamplingConfig attribute)
rank() (tensorrt_llm.functional.Tensor method)
recv() (in module tensorrt_llm.functional)
relative (tensorrt_llm.functional.PositionEmbeddingType attribute)
relu() (in module tensorrt_llm.functional)
remove_input_padding (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
repetition_penalty (tensorrt_llm.runtime.SamplingConfig attribute)
replace_all_uses_with() (tensorrt_llm.functional.Tensor method)
RING (tensorrt_llm.functional.AllReduceStrategy attribute)
rms_norm() (in module tensorrt_llm.functional)
RmsNorm (class in tensorrt_llm.layers.normalization)
(tensorrt_llm.functional.LayerNormType attribute)
rope_gpt_neox (tensorrt_llm.functional.PositionEmbeddingType attribute)
rope_gptj (tensorrt_llm.functional.PositionEmbeddingType attribute)
RotaryScalingType (class in tensorrt_llm.functional)
round() (in module tensorrt_llm.functional)
RowLinear (class in tensorrt_llm.layers.linear)
run() (tensorrt_llm.runtime.Session method)
runtime (tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession attribute)
(tensorrt_llm.runtime.GenerationSession attribute)
(tensorrt_llm.runtime.Session property)
S
SamplingConfig (class in tensorrt_llm.runtime)
select() (in module tensorrt_llm.functional)
send() (in module tensorrt_llm.functional)
Session (class in tensorrt_llm.runtime)
setup() (tensorrt_llm.runtime.GenerationSession method)
shape (tensorrt_llm.functional.Tensor property)
(tensorrt_llm.runtime.TensorInfo attribute)
shape() (in module tensorrt_llm.functional)
sigmoid() (in module tensorrt_llm.functional)
silu() (in module tensorrt_llm.functional)
sin() (in module tensorrt_llm.functional)
size() (tensorrt_llm.functional.Tensor method)
slice() (in module tensorrt_llm.functional)
smooth_quantize() (in module tensorrt_llm.models)
softmax() (in module tensorrt_llm.functional)
softplus() (in module tensorrt_llm.functional)
split() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
sqrt() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
squared_relu() (in module tensorrt_llm.functional)
step() (tensorrt_llm.runtime.KVCacheManager method)
sub() (in module tensorrt_llm.functional)
swiglu() (in module tensorrt_llm.functional)
T
tanh() (in module tensorrt_llm.functional)
temperature (tensorrt_llm.runtime.SamplingConfig attribute)
Tensor (class in tensorrt_llm.functional)
TensorInfo (class in tensorrt_llm.runtime)
tensorrt_llm
module, [1], [2], [3], [4], [5]
tensorrt_llm (C++ type), [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21]
tensorrt_llm.functional
module
tensorrt_llm.layers.activation
module
tensorrt_llm.layers.attention
module
tensorrt_llm.layers.cast
module
tensorrt_llm.layers.conv
module
tensorrt_llm.layers.embedding
module
tensorrt_llm.layers.linear
module
tensorrt_llm.layers.mlp
module
tensorrt_llm.layers.normalization
module
tensorrt_llm.layers.pooling
module
tensorrt_llm.models
module
tensorrt_llm.plugin
module
tensorrt_llm.quantization
module
tensorrt_llm.runtime
module
tensorrt_llm::batch_manager (C++ type)
tensorrt_llm::batch_manager::kv_cache_manager (C++ type)
tensorrt_llm::layers (C++ type)
tensorrt_llm::layers::DynamicDecodeLayer (C++ class)
tensorrt_llm::runtime (C++ type), [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21]
tensorrt_llm::runtime::bufferCast (C++ function), [1]
tensorrt_llm::runtime::BufferDataType (C++ class)
tensorrt_llm::runtime::BufferDataType::BufferDataType (C++ function)
tensorrt_llm::runtime::BufferDataType::getDataType (C++ function)
tensorrt_llm::runtime::BufferDataType::getSize (C++ function)
tensorrt_llm::runtime::BufferDataType::isPointer (C++ function)
tensorrt_llm::runtime::BufferDataType::isUnsigned (C++ function)
tensorrt_llm::runtime::BufferDataType::kTrtPointerType (C++ member)
tensorrt_llm::runtime::BufferDataType::mDataType (C++ member)
tensorrt_llm::runtime::BufferDataType::mPointer (C++ member)
tensorrt_llm::runtime::BufferDataType::mUnsigned (C++ member)
tensorrt_llm::runtime::BufferDataType::operator nvinfer1::DataType (C++ function)
tensorrt_llm::runtime::BufferManager (C++ class)
tensorrt_llm::runtime::BufferManager::allocate (C++ function), [1]
tensorrt_llm::runtime::BufferManager::BufferManager (C++ function)
tensorrt_llm::runtime::BufferManager::copy (C++ function), [1], [2], [3], [4]
tensorrt_llm::runtime::BufferManager::copyFrom (C++ function), [1], [2], [3], [4]
tensorrt_llm::runtime::BufferManager::cpu (C++ function), [1]
tensorrt_llm::runtime::BufferManager::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::BufferManager::emptyBuffer (C++ function)
tensorrt_llm::runtime::BufferManager::emptyTensor (C++ function)
tensorrt_llm::runtime::BufferManager::getStream (C++ function)
tensorrt_llm::runtime::BufferManager::gpu (C++ function), [1]
tensorrt_llm::runtime::BufferManager::IBufferPtr (C++ type)
tensorrt_llm::runtime::BufferManager::initMemoryPool (C++ function)
tensorrt_llm::runtime::BufferManager::ITensorPtr (C++ type)
tensorrt_llm::runtime::BufferManager::kBYTE_TYPE (C++ member)
tensorrt_llm::runtime::BufferManager::mStream (C++ member)
tensorrt_llm::runtime::BufferManager::pinned (C++ function), [1]
tensorrt_llm::runtime::BufferManager::setZero (C++ function)
tensorrt_llm::runtime::BufferRange (C++ class)
tensorrt_llm::runtime::BufferRange::begin (C++ function), [1]
tensorrt_llm::runtime::BufferRange::BufferRange (C++ function)
tensorrt_llm::runtime::BufferRange::cbegin (C++ function), [1]
tensorrt_llm::runtime::BufferRange::cend (C++ function), [1]
tensorrt_llm::runtime::BufferRange::const_iterator (C++ type)
tensorrt_llm::runtime::BufferRange::const_pointer (C++ type)
tensorrt_llm::runtime::BufferRange::const_reference (C++ type)
tensorrt_llm::runtime::BufferRange::end (C++ function), [1]
tensorrt_llm::runtime::BufferRange::iterator (C++ type)
tensorrt_llm::runtime::BufferRange::mData (C++ member)
tensorrt_llm::runtime::BufferRange::mSize (C++ member)
tensorrt_llm::runtime::BufferRange::operator[] (C++ function), [1]
tensorrt_llm::runtime::BufferRange::pointer (C++ type)
tensorrt_llm::runtime::BufferRange::reference (C++ type)
tensorrt_llm::runtime::BufferRange::size (C++ function)
tensorrt_llm::runtime::BufferRange::size_type (C++ type)
tensorrt_llm::runtime::BufferRange::value_type (C++ type)
tensorrt_llm::runtime::constPointerCast (C++ function), [1]
tensorrt_llm::runtime::CppDataType (C++ struct)
tensorrt_llm::runtime::CppDataType<kDataType, kUnsigned, true> (C++ struct)
tensorrt_llm::runtime::CppDataType<kDataType, kUnsigned, true>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kBOOL, kUnsigned> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kBOOL, kUnsigned>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kFLOAT> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kFLOAT>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kHALF> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kHALF>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT32, true> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT32, true>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT32> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT32>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT64, true> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT64, true>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT64> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT64>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT8> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kINT8>::type (C++ type)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kUINT8, kUnsigned> (C++ struct)
tensorrt_llm::runtime::CppDataType<nvinfer1::DataType::kUINT8, kUnsigned>::type (C++ type)
tensorrt_llm::runtime::CudaEvent (C++ class)
tensorrt_llm::runtime::CudaEvent::CudaEvent (C++ function), [1]
tensorrt_llm::runtime::CudaEvent::Deleter (C++ class)
tensorrt_llm::runtime::CudaEvent::Deleter::Deleter (C++ function), [1]
tensorrt_llm::runtime::CudaEvent::Deleter::mOwnsEvent (C++ member)
tensorrt_llm::runtime::CudaEvent::Deleter::operator() (C++ function)
tensorrt_llm::runtime::CudaEvent::element_type (C++ type)
tensorrt_llm::runtime::CudaEvent::EventPtr (C++ type)
tensorrt_llm::runtime::CudaEvent::get (C++ function)
tensorrt_llm::runtime::CudaEvent::mEvent (C++ member)
tensorrt_llm::runtime::CudaEvent::pointer (C++ type)
tensorrt_llm::runtime::CudaEvent::synchronize (C++ function)
tensorrt_llm::runtime::CudaStream (C++ class)
tensorrt_llm::runtime::CudaStream::CudaStream (C++ function), [1]
tensorrt_llm::runtime::CudaStream::Deleter (C++ class)
tensorrt_llm::runtime::CudaStream::Deleter::Deleter (C++ function), [1]
tensorrt_llm::runtime::CudaStream::Deleter::mOwnsStream (C++ member)
tensorrt_llm::runtime::CudaStream::Deleter::operator() (C++ function)
tensorrt_llm::runtime::CudaStream::get (C++ function)
tensorrt_llm::runtime::CudaStream::getDevice (C++ function)
tensorrt_llm::runtime::CudaStream::mDevice (C++ member)
tensorrt_llm::runtime::CudaStream::mStream (C++ member)
tensorrt_llm::runtime::CudaStream::record (C++ function), [1]
tensorrt_llm::runtime::CudaStream::StreamPtr (C++ type)
tensorrt_llm::runtime::CudaStream::synchronize (C++ function)
tensorrt_llm::runtime::CudaStream::wait (C++ function), [1]
tensorrt_llm::runtime::decoder (C++ type)
tensorrt_llm::runtime::decoder::Input (C++ class)
tensorrt_llm::runtime::decoder::Input::cacheIndirection (C++ member)
tensorrt_llm::runtime::decoder::Input::Input (C++ function)
tensorrt_llm::runtime::decoder::Input::logits (C++ member)
tensorrt_llm::runtime::decoder::Input::TensorPtr (C++ type)
tensorrt_llm::runtime::decoder::Output (C++ class)
tensorrt_llm::runtime::decoder::Output::cacheIndirection (C++ member)
tensorrt_llm::runtime::decoder::Output::Output (C++ function)
tensorrt_llm::runtime::decoder::Output::sequenceLengths (C++ member)
tensorrt_llm::runtime::decoder::Output::TensorPtr (C++ type)
tensorrt_llm::runtime::decoder_batch (C++ type)
tensorrt_llm::runtime::decoder_batch::Input (C++ class)
tensorrt_llm::runtime::decoder_batch::Input::active (C++ member)
tensorrt_llm::runtime::decoder_batch::Input::Base (C++ type)
tensorrt_llm::runtime::decoder_batch::Input::Input (C++ function), [1]
tensorrt_llm::runtime::decoder_batch::Output (C++ type)
tensorrt_llm::runtime::decoder_batch::Request (C++ class)
tensorrt_llm::runtime::decoder_batch::Request::badWordsList (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::embeddingBias (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::endId (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::ids (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::maxNewTokens (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::Request (C++ function)
tensorrt_llm::runtime::decoder_batch::Request::stopWordsList (C++ member)
tensorrt_llm::runtime::decoder_batch::Request::TensorPtr (C++ type)
tensorrt_llm::runtime::decoder_batch::Token (C++ class)
tensorrt_llm::runtime::decoder_batch::Token::active (C++ member)
tensorrt_llm::runtime::decoder_batch::Token::event (C++ member)
tensorrt_llm::runtime::decoder_batch::Token::Token (C++ function)
tensorrt_llm::runtime::DecodingInput (C++ class)
tensorrt_llm::runtime::DecodingInput::badWordsList (C++ member)
tensorrt_llm::runtime::DecodingInput::batchSize (C++ member)
tensorrt_llm::runtime::DecodingInput::cacheIndirection (C++ member)
tensorrt_llm::runtime::DecodingInput::DecodingInput (C++ function)
tensorrt_llm::runtime::DecodingInput::embeddingBias (C++ member)
tensorrt_llm::runtime::DecodingInput::endIds (C++ member)
tensorrt_llm::runtime::DecodingInput::lengths (C++ member)
tensorrt_llm::runtime::DecodingInput::logits (C++ member)
tensorrt_llm::runtime::DecodingInput::maxLength (C++ member)
tensorrt_llm::runtime::DecodingInput::noRepeatNgramSize (C++ member)
tensorrt_llm::runtime::DecodingInput::sequenceLimitLength (C++ member)
tensorrt_llm::runtime::DecodingInput::step (C++ member)
tensorrt_llm::runtime::DecodingInput::stopWordsList (C++ member)
tensorrt_llm::runtime::DecodingInput::TensorPtr (C++ type)
tensorrt_llm::runtime::DecodingOutput (C++ class)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses (C++ class)
tensorrt_llm::runtime::DecodingOutput::beamHypotheses (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::cumLogProbs (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::empty (C++ function)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::init (C++ function)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::isDone (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::logProbs (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::minNormedScores (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::normedScores (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::numBeams (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::outputIdsTgt (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::release (C++ function)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape (C++ function)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::sequenceLengthsTgt (C++ member)
tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::slice (C++ function)
tensorrt_llm::runtime::DecodingOutput::cacheIndirection (C++ member)
tensorrt_llm::runtime::DecodingOutput::cumLogProbs (C++ member)
tensorrt_llm::runtime::DecodingOutput::DecodingOutput (C++ function)
tensorrt_llm::runtime::DecodingOutput::finished (C++ member)
tensorrt_llm::runtime::DecodingOutput::finishedSum (C++ member)
tensorrt_llm::runtime::DecodingOutput::ids (C++ member)
tensorrt_llm::runtime::DecodingOutput::kNegativeInfinity (C++ member)
tensorrt_llm::runtime::DecodingOutput::lengths (C++ member)
tensorrt_llm::runtime::DecodingOutput::logProbs (C++ member)
tensorrt_llm::runtime::DecodingOutput::newTokens (C++ member)
tensorrt_llm::runtime::DecodingOutput::parentIds (C++ member)
tensorrt_llm::runtime::DecodingOutput::TensorPtr (C++ type)
tensorrt_llm::runtime::GenerationInput (C++ class)
tensorrt_llm::runtime::GenerationInput::badWordsList (C++ member)
tensorrt_llm::runtime::GenerationInput::embeddingBiasOpt (C++ member)
tensorrt_llm::runtime::GenerationInput::endId (C++ member)
tensorrt_llm::runtime::GenerationInput::GenerationInput (C++ function)
tensorrt_llm::runtime::GenerationInput::ids (C++ member)
tensorrt_llm::runtime::GenerationInput::lengths (C++ member)
tensorrt_llm::runtime::GenerationInput::maxNewTokens (C++ member)
tensorrt_llm::runtime::GenerationInput::packed (C++ member)
tensorrt_llm::runtime::GenerationInput::padId (C++ member)
tensorrt_llm::runtime::GenerationInput::stopWordsList (C++ member)
tensorrt_llm::runtime::GenerationInput::TensorPtr (C++ type)
tensorrt_llm::runtime::GenerationOutput (C++ class)
tensorrt_llm::runtime::GenerationOutput::Callback (C++ type)
tensorrt_llm::runtime::GenerationOutput::contextLogits (C++ member)
tensorrt_llm::runtime::GenerationOutput::GenerationOutput (C++ function)
tensorrt_llm::runtime::GenerationOutput::ids (C++ member)
tensorrt_llm::runtime::GenerationOutput::logProbs (C++ member)
tensorrt_llm::runtime::GenerationOutput::onTokenGenerated (C++ member)
tensorrt_llm::runtime::GenerationOutput::TensorPtr (C++ type)
tensorrt_llm::runtime::GptDecoder (C++ class)
tensorrt_llm::runtime::GptDecoder::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::GptDecoder::forward (C++ function)
tensorrt_llm::runtime::GptDecoder::forwardAsync (C++ function)
tensorrt_llm::runtime::GptDecoder::GptDecoder (C++ function)
tensorrt_llm::runtime::GptDecoder::mAllocator (C++ member)
tensorrt_llm::runtime::GptDecoder::mDynamicDecodeLayer (C++ member)
tensorrt_llm::runtime::GptDecoder::mManager (C++ member)
tensorrt_llm::runtime::GptDecoder::setup (C++ function)
tensorrt_llm::runtime::GptDecoderBatch (C++ class)
tensorrt_llm::runtime::GptDecoderBatch::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::DecodingInputPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::DecodingOutputPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::forwardAsync (C++ function), [1]
tensorrt_llm::runtime::GptDecoderBatch::forwardSync (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getCumLogProbs (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getFinalOutputIds (C++ function)
,
[1]
tensorrt_llm::runtime::GptDecoderBatch::getFinished (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getFinishedBeams (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getNbFinished (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getNbSteps (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getNewTokens (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getOutputIds (C++ function)
,
[1]
tensorrt_llm::runtime::GptDecoderBatch::getOutputLengths (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::getParentIds (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::GptDecoderBatch (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::GptDecoderPtr (C++ type)
tensorrt_llm::runtime::GptDecoderBatch::isFinishedSync (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::mActualBatchSize (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mBeamWidths (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mBufferManager (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDecoders (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDecodingInputs (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mDecodingOutputs (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mFinished (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mFinishedSum (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mForwardEvent (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mForwardToken (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mJointDecodingInput (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mJointDecodingOutput (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxNewTokens (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mMaxSequenceLength (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mNbSteps (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mStream (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mStreams (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mVocabSize (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::mVocabSizePadded (C++ member)
tensorrt_llm::runtime::GptDecoderBatch::newBatch (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::newRequest (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::postProcessRequest (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::setup (C++ function)
tensorrt_llm::runtime::GptDecoderBatch::TensorPtr (C++ type)
tensorrt_llm::runtime::GptJsonConfig (C++ class)
tensorrt_llm::runtime::GptJsonConfig::engineFilename (C++ function)
,
[1]
tensorrt_llm::runtime::GptJsonConfig::getModelConfig (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getName (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getPipelineParallelism (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getPrecision (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getTensorParallelism (C++ function)
tensorrt_llm::runtime::GptJsonConfig::getWorldSize (C++ function)
tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig (C++ function)
tensorrt_llm::runtime::GptJsonConfig::mGptModelConfig (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mName (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mPipelineParallelism (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mPrecision (C++ member)
tensorrt_llm::runtime::GptJsonConfig::mTensorParallelism (C++ member)
tensorrt_llm::runtime::GptJsonConfig::parse (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::GptModelConfig (C++ class)
tensorrt_llm::runtime::GptModelConfig::computeContextLogits (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::getDataType (C++ function)
tensorrt_llm::runtime::GptModelConfig::getHiddenSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxBatchSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxInputLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxNumTokens (C++ function)
tensorrt_llm::runtime::GptModelConfig::getMaxOutputLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::getModelVariant (C++ function)
tensorrt_llm::runtime::GptModelConfig::getNbHeads (C++ function)
tensorrt_llm::runtime::GptModelConfig::getNbKvHeads (C++ function)
tensorrt_llm::runtime::GptModelConfig::getNbLayers (C++ function)
tensorrt_llm::runtime::GptModelConfig::getQuantMode (C++ function)
tensorrt_llm::runtime::GptModelConfig::getSizePerHead (C++ function)
tensorrt_llm::runtime::GptModelConfig::getTokensPerBlock (C++ function)
tensorrt_llm::runtime::GptModelConfig::getVocabSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::getVocabSizePadded (C++ function)
tensorrt_llm::runtime::GptModelConfig::GptModelConfig (C++ function)
tensorrt_llm::runtime::GptModelConfig::mComputeContextLogits (C++ member)
tensorrt_llm::runtime::GptModelConfig::mDataType (C++ member)
tensorrt_llm::runtime::GptModelConfig::mHiddenSize (C++ member)
tensorrt_llm::runtime::GptModelConfig::mInputPacked (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxBatchSize (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxInputLen (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxNumTokens (C++ member)
tensorrt_llm::runtime::GptModelConfig::mMaxOutputLen (C++ member)
tensorrt_llm::runtime::GptModelConfig::mModelVariant (C++ member)
tensorrt_llm::runtime::GptModelConfig::mNbHeads (C++ member)
tensorrt_llm::runtime::GptModelConfig::mNbKvHeads (C++ member)
tensorrt_llm::runtime::GptModelConfig::mNbLayers (C++ member)
tensorrt_llm::runtime::GptModelConfig::ModelVariant (C++ enum)
tensorrt_llm::runtime::GptModelConfig::ModelVariant::kGlm (C++ enumerator)
tensorrt_llm::runtime::GptModelConfig::ModelVariant::kGpt (C++ enumerator)
tensorrt_llm::runtime::GptModelConfig::mPagedKvCache (C++ member)
tensorrt_llm::runtime::GptModelConfig::mQuantMode (C++ member)
tensorrt_llm::runtime::GptModelConfig::mTokensPerBlock (C++ member)
tensorrt_llm::runtime::GptModelConfig::mUseCustomAllReduce (C++ member)
tensorrt_llm::runtime::GptModelConfig::mUseGptAttentionPlugin (C++ member)
tensorrt_llm::runtime::GptModelConfig::mVocabSize (C++ member)
tensorrt_llm::runtime::GptModelConfig::setMaxBatchSize (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxInputLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxNumTokens (C++ function)
tensorrt_llm::runtime::GptModelConfig::setMaxOutputLen (C++ function)
tensorrt_llm::runtime::GptModelConfig::setModelVariant (C++ function)
tensorrt_llm::runtime::GptModelConfig::setNbKvHeads (C++ function)
tensorrt_llm::runtime::GptModelConfig::setQuantMode (C++ function)
tensorrt_llm::runtime::GptModelConfig::setTokensPerBlock (C++ function)
tensorrt_llm::runtime::GptModelConfig::supportsInflightBatching (C++ function)
tensorrt_llm::runtime::GptModelConfig::useCustomAllReduce (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::useGptAttentionPlugin (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::usePackedInput (C++ function)
,
[1]
tensorrt_llm::runtime::GptModelConfig::usePagedKvCache (C++ function)
,
[1]
tensorrt_llm::runtime::GptSession (C++ class)
tensorrt_llm::runtime::GptSession::createBuffers (C++ function)
tensorrt_llm::runtime::GptSession::createContexts (C++ function)
tensorrt_llm::runtime::GptSession::createCustomAllReduceWorkspace (C++ function)
tensorrt_llm::runtime::GptSession::createDecoders (C++ function)
tensorrt_llm::runtime::GptSession::createKvCacheManagers (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor (C++ class)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::clear (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::create (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::cudaGraphExecPtr (C++ type)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::CudaGraphExecutor (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::hasInstance (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::launch (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::mInstance (C++ member)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::prepareNextGraph (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::update (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::uploadToStream (C++ function)
tensorrt_llm::runtime::GptSession::CudaGraphExecutor::~CudaGraphExecutor (C++ function)
tensorrt_llm::runtime::GptSession::decoderStepAsync (C++ function)
tensorrt_llm::runtime::GptSession::finalizeOutputIds (C++ function)
tensorrt_llm::runtime::GptSession::generate (C++ function)
tensorrt_llm::runtime::GptSession::generateMultiBatch (C++ function)
tensorrt_llm::runtime::GptSession::generateSingleBatch (C++ function)
tensorrt_llm::runtime::GptSession::getBufferManager (C++ function)
tensorrt_llm::runtime::GptSession::getDevice (C++ function)
tensorrt_llm::runtime::GptSession::getLogger (C++ function)
tensorrt_llm::runtime::GptSession::getModelConfig (C++ function)
tensorrt_llm::runtime::GptSession::getWorldConfig (C++ function)
tensorrt_llm::runtime::GptSession::GptSession (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::GptSession::initNewTokens (C++ function)
tensorrt_llm::runtime::GptSession::isCudaGraphMode (C++ function)
tensorrt_llm::runtime::GptSession::kvCacheAddSequences (C++ function)
tensorrt_llm::runtime::GptSession::KvCacheManager (C++ type)
tensorrt_llm::runtime::GptSession::LoggerPtr (C++ type)
tensorrt_llm::runtime::GptSession::mBuffers (C++ member)
tensorrt_llm::runtime::GptSession::mCommEvent (C++ member)
tensorrt_llm::runtime::GptSession::mCommStream (C++ member)
tensorrt_llm::runtime::GptSession::mCudaGraphInstances (C++ member)
tensorrt_llm::runtime::GptSession::mCudaGraphMode (C++ member)
tensorrt_llm::runtime::GptSession::mDecoderMaxSequenceLength (C++ member)
tensorrt_llm::runtime::GptSession::mDecoders (C++ member)
tensorrt_llm::runtime::GptSession::mDevice (C++ member)
tensorrt_llm::runtime::GptSession::mKvCacheManagers (C++ member)
tensorrt_llm::runtime::GptSession::mLogger (C++ member)
tensorrt_llm::runtime::GptSession::mModelConfig (C++ member)
tensorrt_llm::runtime::GptSession::mNumMicroBatches (C++ member)
tensorrt_llm::runtime::GptSession::mPipelineComm (C++ member)
tensorrt_llm::runtime::GptSession::mReceivedEvents (C++ member)
tensorrt_llm::runtime::GptSession::mRuntime (C++ member)
tensorrt_llm::runtime::GptSession::mWorldConfig (C++ member)
tensorrt_llm::runtime::GptSession::setCudaGraphMode (C++ function)
tensorrt_llm::runtime::GptSession::setup (C++ function)
tensorrt_llm::runtime::GptSession::shouldStopSync (C++ function)
tensorrt_llm::runtime::IBuffer (C++ class)
tensorrt_llm::runtime::IBuffer::data (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::runtime::IBuffer::DataType (C++ type)
tensorrt_llm::runtime::IBuffer::getCapacity (C++ function)
tensorrt_llm::runtime::IBuffer::getDataType (C++ function)
tensorrt_llm::runtime::IBuffer::getMemoryType (C++ function)
tensorrt_llm::runtime::IBuffer::getSize (C++ function)
tensorrt_llm::runtime::IBuffer::getSizeInBytes (C++ function)
tensorrt_llm::runtime::IBuffer::IBuffer (C++ function)
,
[1]
tensorrt_llm::runtime::IBuffer::memoryType (C++ function)
tensorrt_llm::runtime::IBuffer::operator= (C++ function)
tensorrt_llm::runtime::IBuffer::release (C++ function)
tensorrt_llm::runtime::IBuffer::resize (C++ function)
tensorrt_llm::runtime::IBuffer::SharedConstPtr (C++ type)
tensorrt_llm::runtime::IBuffer::SharedPtr (C++ type)
tensorrt_llm::runtime::IBuffer::slice (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::runtime::IBuffer::toBytes (C++ function)
tensorrt_llm::runtime::IBuffer::UniqueConstPtr (C++ type)
tensorrt_llm::runtime::IBuffer::UniquePtr (C++ type)
tensorrt_llm::runtime::IBuffer::view (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::IBuffer::wrap (C++ function)
,
[1]
,
[2]
,
[3]
,
[4]
tensorrt_llm::runtime::IBuffer::~IBuffer (C++ function)
tensorrt_llm::runtime::IGptDecoder (C++ class)
tensorrt_llm::runtime::IGptDecoder::create (C++ function)
tensorrt_llm::runtime::IGptDecoder::forward (C++ function)
tensorrt_llm::runtime::IGptDecoder::forwardAsync (C++ function)
tensorrt_llm::runtime::IGptDecoder::gatherTree (C++ function)
tensorrt_llm::runtime::IGptDecoder::setup (C++ function)
tensorrt_llm::runtime::IGptDecoder::~IGptDecoder (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch (C++ class)
tensorrt_llm::runtime::IGptDecoderBatch::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::IGptDecoderBatch::forward (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::forwardAsync (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::forwardSync (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getCumLogProbs (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getFinalOutputIds (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getFinished (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getFinishedBeams (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getNbSteps (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getOutputIds (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getOutputLengths (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::getParentIds (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::IGptDecoderBatch (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::newRequest (C++ function)
tensorrt_llm::runtime::IGptDecoderBatch::TensorPtr (C++ type)
tensorrt_llm::runtime::IGptDecoderBatch::TokenPtr (C++ type)
tensorrt_llm::runtime::IpcMemory (C++ class)
tensorrt_llm::runtime::IpcMemory::allocateIpcMemory (C++ function)
tensorrt_llm::runtime::IpcMemory::destroyIpcMemory (C++ function)
tensorrt_llm::runtime::IpcMemory::FLAGS_SIZE (C++ member)
tensorrt_llm::runtime::IpcMemory::getCommPtrsTensor (C++ function)
tensorrt_llm::runtime::IpcMemory::IpcMemory (C++ function)
tensorrt_llm::runtime::IpcMemory::mBufferPtr (C++ member)
tensorrt_llm::runtime::IpcMemory::mBufferSize (C++ member)
tensorrt_llm::runtime::IpcMemory::mCommPtrs (C++ member)
tensorrt_llm::runtime::IpcMemory::mWorldConfig (C++ member)
tensorrt_llm::runtime::IpcMemory::TensorPtr (C++ type)
tensorrt_llm::runtime::IpcMemory::~IpcMemory (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder (C++ class)
tensorrt_llm::runtime::IStatefulGptDecoder::CudaStreamPtr (C++ type)
tensorrt_llm::runtime::IStatefulGptDecoder::forward (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::forwardAsync (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getFinalOutputIds (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getNbFinished (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getNewTokens (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::getOutputIds (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::isFinishedSync (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::IStatefulGptDecoder (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::newBatch (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::setup (C++ function)
tensorrt_llm::runtime::IStatefulGptDecoder::TensorPtr (C++ type)
tensorrt_llm::runtime::ITensor (C++ class)
tensorrt_llm::runtime::ITensor::getShape (C++ function)
tensorrt_llm::runtime::ITensor::ITensor (C++ function)
,
[1]
tensorrt_llm::runtime::ITensor::makeShape (C++ function)
tensorrt_llm::runtime::ITensor::operator= (C++ function)
tensorrt_llm::runtime::ITensor::reshape (C++ function)
tensorrt_llm::runtime::ITensor::Shape (C++ type)
tensorrt_llm::runtime::ITensor::SharedConstPtr (C++ type)
tensorrt_llm::runtime::ITensor::SharedPtr (C++ type)
tensorrt_llm::runtime::ITensor::slice (C++ function)
,
[1]
,
[2]
,
[3]
tensorrt_llm::runtime::ITensor::squeeze (C++ function)
,
[1]
tensorrt_llm::runtime::ITensor::toString (C++ function)
tensorrt_llm::runtime::ITensor::UniqueConstPtr (C++ type)
tensorrt_llm::runtime::ITensor::UniquePtr (C++ type)
tensorrt_llm::runtime::ITensor::view (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::ITensor::volume (C++ function)
tensorrt_llm::runtime::ITensor::volumeNonNegative (C++ function)
tensorrt_llm::runtime::ITensor::wrap (C++ function)
,
[1]
,
[2]
,
[3]
,
[4]
tensorrt_llm::runtime::ITensor::~ITensor (C++ function)
tensorrt_llm::runtime::MemoryCounters (C++ class)
tensorrt_llm::runtime::MemoryCounters::allocate (C++ function)
,
[1]
tensorrt_llm::runtime::MemoryCounters::bytesToString (C++ function)
,
[1]
tensorrt_llm::runtime::MemoryCounters::deallocate (C++ function)
,
[1]
tensorrt_llm::runtime::MemoryCounters::DiffType (C++ type)
tensorrt_llm::runtime::MemoryCounters::getCpu (C++ function)
tensorrt_llm::runtime::MemoryCounters::getCpuDiff (C++ function)
tensorrt_llm::runtime::MemoryCounters::getGpu (C++ function)
tensorrt_llm::runtime::MemoryCounters::getGpuDiff (C++ function)
tensorrt_llm::runtime::MemoryCounters::getInstance (C++ function)
tensorrt_llm::runtime::MemoryCounters::getPinned (C++ function)
tensorrt_llm::runtime::MemoryCounters::getPinnedDiff (C++ function)
tensorrt_llm::runtime::MemoryCounters::mCpu (C++ member)
tensorrt_llm::runtime::MemoryCounters::mCpuDiff (C++ member)
tensorrt_llm::runtime::MemoryCounters::MemoryCounters (C++ function)
tensorrt_llm::runtime::MemoryCounters::mGpu (C++ member)
tensorrt_llm::runtime::MemoryCounters::mGpuDiff (C++ member)
tensorrt_llm::runtime::MemoryCounters::mInstance (C++ member)
tensorrt_llm::runtime::MemoryCounters::mPinned (C++ member)
tensorrt_llm::runtime::MemoryCounters::mPinnedDiff (C++ member)
tensorrt_llm::runtime::MemoryCounters::SizeType (C++ type)
tensorrt_llm::runtime::MemoryType (C++ enum)
tensorrt_llm::runtime::MemoryType::kCPU (C++ enumerator)
tensorrt_llm::runtime::MemoryType::kGPU (C++ enumerator)
tensorrt_llm::runtime::MemoryType::kPINNED (C++ enumerator)
tensorrt_llm::runtime::MemoryTypeString (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kCPU> (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kCPU>::value (C++ member)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kGPU> (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kGPU>::value (C++ member)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kPINNED> (C++ struct)
tensorrt_llm::runtime::MemoryTypeString<MemoryType::kPINNED>::value (C++ member)
tensorrt_llm::runtime::operator<< (C++ function)
,
[1]
,
[2]
tensorrt_llm::runtime::PhonyNameDueToError::type (C++ type)
,
[1]
,
[2]
,
[3]
,
[4]
,
[5]
,
[6]
,
[7]
,
[8]
tensorrt_llm::runtime::PhonyNameDueToError::value (C++ member)
,
[1]
,
[2]
,
[3]
,
[4]
,
[5]
,
[6]
,
[7]
,
[8]
tensorrt_llm::runtime::PointerElementType (C++ type)
tensorrt_llm::runtime::SamplingConfig (C++ class)
tensorrt_llm::runtime::SamplingConfig::beamSearchDiversityRate (C++ member)
tensorrt_llm::runtime::SamplingConfig::beamWidth (C++ member)
tensorrt_llm::runtime::SamplingConfig::FloatType (C++ type)
tensorrt_llm::runtime::SamplingConfig::lengthPenalty (C++ member)
tensorrt_llm::runtime::SamplingConfig::minLength (C++ member)
tensorrt_llm::runtime::SamplingConfig::OptVec (C++ type)
tensorrt_llm::runtime::SamplingConfig::presencePenalty (C++ member)
tensorrt_llm::runtime::SamplingConfig::randomSeed (C++ member)
tensorrt_llm::runtime::SamplingConfig::repetitionPenalty (C++ member)
tensorrt_llm::runtime::SamplingConfig::SamplingConfig (C++ function)
tensorrt_llm::runtime::SamplingConfig::temperature (C++ member)
tensorrt_llm::runtime::SamplingConfig::topK (C++ member)
tensorrt_llm::runtime::SamplingConfig::topP (C++ member)
tensorrt_llm::runtime::SamplingConfig::topPDecay (C++ member)
tensorrt_llm::runtime::SamplingConfig::topPMin (C++ member)
tensorrt_llm::runtime::SamplingConfig::topPResetIds (C++ member)
tensorrt_llm::runtime::setPeerAccess (C++ function)
tensorrt_llm::runtime::SizeType (C++ type)
tensorrt_llm::runtime::StringPtrMap (C++ type)
tensorrt_llm::runtime::TllmLogger (C++ class)
tensorrt_llm::runtime::TllmLogger::getLevel (C++ function)
tensorrt_llm::runtime::TllmLogger::log (C++ function)
tensorrt_llm::runtime::TllmLogger::setLevel (C++ function)
tensorrt_llm::runtime::TokenIdType (C++ type)
tensorrt_llm::runtime::TRTDataType (C++ struct)
tensorrt_llm::runtime::TRTDataType<bool> (C++ struct)
tensorrt_llm::runtime::TRTDataType<bool>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<float> (C++ struct)
tensorrt_llm::runtime::TRTDataType<float>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<half> (C++ struct)
tensorrt_llm::runtime::TRTDataType<half>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::int32_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::int32_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::int64_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::int64_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::int8_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::int8_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::uint32_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::uint32_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::uint64_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::uint64_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<std::uint8_t> (C++ struct)
tensorrt_llm::runtime::TRTDataType<std::uint8_t>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<T*> (C++ struct)
tensorrt_llm::runtime::TRTDataType<T*>::kUnderlyingType (C++ member)
tensorrt_llm::runtime::TRTDataType<T*>::value (C++ member)
tensorrt_llm::runtime::TRTDataType<void*> (C++ struct)
tensorrt_llm::runtime::TRTDataType<void*>::value (C++ member)
tensorrt_llm::runtime::utils (C++ type)
tensorrt_llm::runtime::utils::loadEngine (C++ function)
tensorrt_llm::runtime::WorldConfig (C++ class)
tensorrt_llm::runtime::WorldConfig::getDevice (C++ function)
tensorrt_llm::runtime::WorldConfig::getGpusPerNode (C++ function)
tensorrt_llm::runtime::WorldConfig::getPipelineParallelGroup (C++ function)
tensorrt_llm::runtime::WorldConfig::getPipelineParallelism (C++ function)
tensorrt_llm::runtime::WorldConfig::getPipelineParallelRank (C++ function)
tensorrt_llm::runtime::WorldConfig::getRank (C++ function)
tensorrt_llm::runtime::WorldConfig::getSize (C++ function)
tensorrt_llm::runtime::WorldConfig::getTensorParallelism (C++ function)
tensorrt_llm::runtime::WorldConfig::getTensorParallelRank (C++ function)
tensorrt_llm::runtime::WorldConfig::isFirstPipelineParallelRank (C++ function)
tensorrt_llm::runtime::WorldConfig::isLastPipelineParallelRank (C++ function)
tensorrt_llm::runtime::WorldConfig::isPipelineParallel (C++ function)
tensorrt_llm::runtime::WorldConfig::isTensorParallel (C++ function)
tensorrt_llm::runtime::WorldConfig::kDefaultGpusPerNode (C++ member)
tensorrt_llm::runtime::WorldConfig::mGpusPerNode (C++ member)
tensorrt_llm::runtime::WorldConfig::mpi (C++ function)
,
[1]
tensorrt_llm::runtime::WorldConfig::mPipelineParallelism (C++ member)
tensorrt_llm::runtime::WorldConfig::mRank (C++ member)
tensorrt_llm::runtime::WorldConfig::mTensorParallelism (C++ member)
tensorrt_llm::runtime::WorldConfig::validConfig (C++ function)
tensorrt_llm::runtime::WorldConfig::WorldConfig (C++ function)
to_word_list_format() (in module tensorrt_llm.runtime)
tokens_per_block (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
top_k (tensorrt_llm.runtime.SamplingConfig attribute)
top_p (tensorrt_llm.runtime.SamplingConfig attribute)
transpose() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
TWOSHOT (tensorrt_llm.functional.AllReduceStrategy attribute)
U
unary() (in module tensorrt_llm.functional)
unsqueeze() (in module tensorrt_llm.functional)
use_beam_hyps (tensorrt_llm.runtime.SamplingConfig attribute)
use_custom_all_reduce (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
use_gpt_attention_plugin (tensorrt_llm.runtime.GenerationSession property)
use_prompt_tuning (tensorrt_llm.runtime.ModelConfig attribute)
V
view() (in module tensorrt_llm.functional)
(tensorrt_llm.functional.Tensor method)
vocab_size (tensorrt_llm.runtime.GenerationSession property)
(tensorrt_llm.runtime.ModelConfig attribute)
W
weight_only_groupwise_quantize() (in module tensorrt_llm.models)
weight_only_quantize() (in module tensorrt_llm.models)
where() (in module tensorrt_llm.functional)