Runtime

class tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]

Bases: GenerationSession

batch_size: int
buffer_allocated: bool
cuda_graph_mode: bool
debug_mode: bool
debug_tensors_to_save: None
device: device
mapping: Mapping
runtime: _Runtime
class tensorrt_llm.runtime.GenerationSequence(seq_idx, batch_idx)[source]

Bases: object

get_batch_idx() int[source]

Returns the index of the sequence within the batch

get_seq_idx() int[source]

Returns the sequence index

class tensorrt_llm.runtime.GenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]

Bases: object

batch_size: int
buffer_allocated: bool
property cross_attention
cuda_graph_mode: bool
cuda_stream_guard()[source]

Synchronize with the external stream and set the current stream to the one bound to the session. Reset on exit.

debug_mode: bool
debug_tensors_to_save: None
decode(input_ids: Tensor, context_lengths: Tensor, sampling_config: SamplingConfig, prompt_embedding_table: Tensor = None, tasks: Tensor = None, prompt_vocab_size: Tensor = None, stop_words_list=None, bad_words_list=None, no_repeat_ngram_size=None, streaming: bool = False, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor = None, encoder_input_lengths: Tensor = None)[source]
decode_batch(input_ids: Sequence[Tensor], sampling_config: SamplingConfig, streaming: bool = False)[source]
decode_regular(batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_list, bad_words_list, no_repeat_ngram_size, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor | None = None, encoder_input_lengths: Tensor | None = None)[source]
decode_stream(batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_list, bad_words_list, no_repeat_ngram_size, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor | None = None, encoder_input_lengths: Tensor | None = None)[source]
device: device
property dtype
finalize_decoder(context_lengths, batch_size, beam_width, scfg)[source]
property first_layer
property gather_all_token_logits
handle_per_step(cache_indirections: list, step: int, batch_size: int, max_context_length: int, beam_width: int, input_ids: Tensor, hidden_states: Tensor, scfg: SamplingConfig, kv_cache_block_pointers: list, prompt_embedding_table: Tensor, tasks: Tensor, context_lengths: Tensor, host_context_lengths, attention_mask: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, sequence_lengths: Tensor, next_step_buffer: dict, stop_words_list, bad_words_list, no_repeat_ngram_size, encoder_output: Tensor, encoder_input_lengths: Tensor)[source]
property has_position_embedding
property has_token_type_embedding
property head_size
property hidden_size
property last_layer
mapping: Mapping
property num_heads
property num_heads_kv
property num_layers
property paged_kv_cache
pp_communicate_final_output_ids(final_output_ids, batch_size, beam_width)[source]
pp_communicate_new_tokens(should_stop, cache_indir, sequence_length)[source]
property quant_mode
property remove_input_padding
runtime: _Runtime
setup(batch_size: int, max_context_length: int, max_new_tokens: int, beam_width: int = 1, encoder_max_input_length: int | None = None)[source]
property tokens_per_block
property use_custom_all_reduce
property use_gpt_attention_plugin
property vocab_size
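
A minimal usage sketch, assuming a serialized engine (engine_buffer) and a matching ModelConfig (model_config) are already available; those two names, the token ids, and the end/pad ids are placeholders, and the default Mapping() is only suitable for a single-GPU engine.

    import torch
    from tensorrt_llm import Mapping
    from tensorrt_llm.runtime import GenerationSession, SamplingConfig

    # model_config / engine_buffer: the ModelConfig and serialized engine the
    # session should run (placeholders here).
    session = GenerationSession(model_config, engine_buffer, Mapping())

    batch_size, input_len, max_new_tokens = 2, 32, 64
    input_ids = torch.randint(0, model_config.vocab_size, (batch_size, input_len),
                              dtype=torch.int32, device="cuda")
    context_lengths = torch.full((batch_size,), input_len, dtype=torch.int32, device="cuda")
    sampling = SamplingConfig(end_id=2, pad_id=0, top_k=1)  # ids depend on the tokenizer

    # Allocate runtime buffers for this batch shape, then generate.
    session.setup(batch_size=batch_size, max_context_length=input_len,
                  max_new_tokens=max_new_tokens, beam_width=1)
    output_ids = session.decode(input_ids, context_lengths, sampling)
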
class tensorrt_llm.runtime.KVCacheManager(memory_pools: List[Tensor], blocks: int, tokens_per_block: int, max_blocks_per_seq: int, beam_width: int = 1)[source]

Bases: object

add_sequence(sequence: GenerationSequence, context_len: int)[source]

Add a sequence to the manager and allocate the minimum number of blocks needed for its context

get_pointer_arrays(beam_width: int) List[Tensor][source]

Returns arrays of pointers for all memory pools, copied to the GPU

step(finished: List[bool])[source]

Advance to the next generation step: allocate new blocks where needed and release those of finished sequences.
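
A standalone sketch of the paged KV-cache bookkeeping. The pool shape and size below are placeholders: the real pools are engine-specific and are normally created by GenerationSession itself when paged_kv_cache is enabled.

    import torch
    from tensorrt_llm.runtime import GenerationSequence, KVCacheManager

    tokens_per_block, blocks, max_blocks_per_seq = 64, 128, 16
    # One pool per layer in the real runtime; a single dummy fp16 pool with a
    # made-up per-block payload is enough to illustrate the bookkeeping.
    pools = [torch.empty(blocks, 2, tokens_per_block * 1024,
                         dtype=torch.float16, device="cuda")]

    manager = KVCacheManager(memory_pools=pools, blocks=blocks,
                             tokens_per_block=tokens_per_block,
                             max_blocks_per_seq=max_blocks_per_seq, beam_width=1)

    # Register a sequence with a 100-token context; blocks covering it are reserved.
    manager.add_sequence(GenerationSequence(seq_idx=0, batch_idx=0), context_len=100)

    # After each generated token: grow blocks where needed, free finished sequences.
    manager.step(finished=[False])

    # Per-pool pointer arrays, copied to the GPU for the attention plugin to consume.
    pointer_arrays = manager.get_pointer_arrays(beam_width=1)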

class tensorrt_llm.runtime.ModelConfig(vocab_size: int, num_layers: int, num_heads: int, num_kv_heads: int, hidden_size: int, gpt_attention_plugin: bool, remove_input_padding: bool = False, model_name: str = '', paged_kv_cache: bool = False, cross_attention: bool = False, has_position_embedding: bool = True, has_token_type_embedding: bool = False, tokens_per_block: int = 64, use_prompt_tuning: bool = False, quant_mode: tensorrt_llm.quantization.mode.QuantMode = <QuantMode.0: 0>, gather_all_token_logits: bool = False, dtype: str = '', use_custom_all_reduce: bool = False)[source]

Bases: object

cross_attention: bool = False
dtype: str = ''
gather_all_token_logits: bool = False
gpt_attention_plugin: bool
has_position_embedding: bool = True
has_token_type_embedding: bool = False
hidden_size: int
model_name: str = ''
num_heads: int
num_kv_heads: int
num_layers: int
paged_kv_cache: bool = False
quant_mode: QuantMode = 0
remove_input_padding: bool = False
tokens_per_block: int = 64
use_custom_all_reduce: bool = False
use_prompt_tuning: bool = False
vocab_size: int
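
An illustrative ModelConfig for a hypothetical 32-layer, 4096-hidden decoder; every value here is a placeholder and must match the engine that was actually built.

    from tensorrt_llm.runtime import ModelConfig

    model_config = ModelConfig(
        vocab_size=32000,
        num_layers=32,
        num_heads=32,
        num_kv_heads=32,       # fewer than num_heads would indicate grouped/multi-query attention
        hidden_size=4096,
        gpt_attention_plugin=True,
        remove_input_padding=True,
        paged_kv_cache=True,
        tokens_per_block=64,
        dtype="float16",
    )
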
class tensorrt_llm.runtime.SamplingConfig(end_id: int, pad_id: int, num_beams: int = 1, temperature: float | torch.Tensor = 1.0, top_k: int | torch.Tensor = 1, top_p: float | torch.Tensor = 0.0, length_penalty: float | torch.Tensor = 1.0, repetition_penalty: float | torch.Tensor = 1.0, min_length: int | torch.Tensor = 1, presence_penalty: float | torch.Tensor = 0.0, use_beam_hyps: bool = True)[source]

Bases: object

beam_search_diversity_rate: float | Tensor = None
end_id: int
length_penalty: float | Tensor = 1.0
min_length: int | Tensor = 1
num_beams: int = 1
output_cum_log_probs: bool = False
output_log_probs: bool = False
pad_id: int
presence_penalty: float | Tensor = 0.0
random_seed: int | Tensor = None
repetition_penalty: float | Tensor = 1.0
temperature: float | Tensor = 1.0
top_k: int | Tensor = 1
top_p: float | Tensor = 0.0
use_beam_hyps: bool = True
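
A few typical SamplingConfig instances; the end_id/pad_id values are tokenizer-dependent placeholders.

    import torch
    from tensorrt_llm.runtime import SamplingConfig

    # Greedy decoding: top_k=1 always keeps the most probable token.
    greedy = SamplingConfig(end_id=2, pad_id=0, top_k=1)

    # Beam search over 4 beams with a mild length penalty.
    beam = SamplingConfig(end_id=2, pad_id=0, num_beams=4, length_penalty=0.9)

    # Most fields also accept per-request tensors, e.g. one temperature per batch item.
    per_request = SamplingConfig(end_id=2, pad_id=0,
                                 temperature=torch.tensor([0.7, 1.0]),
                                 top_k=torch.tensor([40, 1], dtype=torch.int32),
                                 top_p=torch.tensor([0.9, 0.0]))
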
class tensorrt_llm.runtime.Session(**kwargs)[source]

Bases: object

Session is a managed TensorRT runtime.

property context: IExecutionContext

@brief: Get the default TensorRT execution context. Use self.engine.create_execution_context() to create a new context if needed.

@return: one TensorRT execution context object

property engine: ICudaEngine
static from_engine(engine) Session[source]

@brief: Create a session from an existing ICudaEngine.
@param engine: an ICudaEngine
@return: a Session object

static from_serialized_engine(engine) Session[source]

@brief: Create a session from a serialized engine.
@param engine: a serialized engine
@return: a Session object

infer_shapes(inputs: List[TensorInfo], context=None) List[TensorInfo][source]
@brief: Set the input shapes on the given context and infer the output shapes from them.

This function should be called whenever the input shapes change, before calling run(); alternatively, call context.set_input_shape on all dynamically shaped input tensors manually.

@param inputs: list of TensorInfo objects, each representing an input tensor
@param context: TensorRT execution context; if None, the default context is used
@return: list of TensorInfo objects, each representing an output tensor, or None on failure

run(inputs: Dict[str, Any], outputs: Dict[str, Any], stream, context=None) bool[source]

@brief: Run the TensorRT engine with the given inputs and outputs.
@param inputs: dict of input tensors; key is the tensor name, value is a tensor pointer or torch tensor
@param outputs: dict of output tensors; key is the tensor name, value is a tensor pointer or torch tensor
@param stream: CUDA stream to enqueue the TensorRT engine on
@param context: TensorRT execution context; if None, the default context is used
@return: True if the enqueue succeeded; note that enqueue is an asynchronous call, so returning True does not mean the execution has finished

property runtime: Runtime
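
A minimal low-level sketch: deserialize an engine, infer the output shapes from the input shapes, then enqueue execution on a CUDA stream. The file name, the tensor name "input", and the float32 dtypes are placeholders tied to how the engine was built.

    import tensorrt as trt
    import torch
    from tensorrt_llm.runtime import Session, TensorInfo

    with open("model.engine", "rb") as f:          # placeholder path
        session = Session.from_serialized_engine(f.read())

    inputs = {"input": torch.ones(1, 16, dtype=torch.float32, device="cuda")}

    # Propagate the input shapes so the output shapes are known, then allocate outputs.
    output_info = session.infer_shapes([TensorInfo("input", trt.DataType.FLOAT, (1, 16))])
    outputs = {t.name: torch.empty(tuple(t.shape), dtype=torch.float32, device="cuda")
               for t in output_info}

    stream = torch.cuda.current_stream().cuda_stream
    ok = session.run(inputs, outputs, stream)      # asynchronous enqueue
    torch.cuda.current_stream().synchronize()      # wait for the results to be ready
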
class tensorrt_llm.runtime.TensorInfo(name: 'str', dtype: 'trt.DataType', shape: 'tuple')[source]

Bases: object

dtype: DataType
name: str
shape: tuple
tensorrt_llm.runtime.to_word_list_format(word_dict: List[List[str]], tokenizer=None)[source]
Format of word_dict:

len(word_dict) should equal batch_size. word_dict[i] holds the words for batch item i. len(word_dict[i]) must be 1, i.e. each entry contains exactly one string. That string may contain several phrases separated by ",". For example, if word_dict[2] = " I am happy, I am sad", this function returns the token ids for the two phrases " I am happy" and " I am sad".
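
A usage sketch, assuming a Hugging Face tokenizer; the model name and the phrases are placeholders. The result can be passed as the stop_words_list or bad_words_list argument of GenerationSession.decode().

    from transformers import AutoTokenizer
    from tensorrt_llm.runtime import to_word_list_format

    tokenizer = AutoTokenizer.from_pretrained("gpt2")   # placeholder tokenizer

    # One string per batch item; phrases inside a string are separated by ",".
    word_dict = [[" I am happy, I am sad"],
                 [" forbidden phrase"]]
    bad_words_list = to_word_list_format(word_dict, tokenizer=tokenizer)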