Runtime

class tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]

Bases: GenerationSession

batch_size: int
buffer_allocated: bool
cuda_graph_mode: bool
debug_mode: bool
debug_tensors_to_save: None
device: device
mapping: Mapping
runtime: _Runtime
class tensorrt_llm.runtime.GenerationSequence(seq_idx, batch_idx)[source]

Bases: object

get_batch_idx() int[source]

Returns the index of the sequence within the batch

get_seq_idx() int[source]

Returns the sequence index

class tensorrt_llm.runtime.GenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]

Bases: object

batch_size: int
buffer_allocated: bool
property cross_attention
cuda_graph_mode: bool
cuda_stream_guard()[source]

Synchronize with the external stream and set the current stream to the one bound to the session. Reset on exit.

debug_mode: bool
debug_tensors_to_save: None
decode(input_ids: Tensor, context_lengths: Tensor, sampling_config: SamplingConfig, prompt_embedding_table: Tensor = None, tasks: Tensor = None, prompt_vocab_size: Tensor = None, stop_words_list=None, bad_words_list=None, no_repeat_ngram_size=None, streaming: bool = False, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor = None, encoder_input_lengths: Tensor = None)[source]
decode_batch(input_ids: Sequence[Tensor], sampling_config: SamplingConfig, streaming: bool = False)[source]
decode_regular(batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_list, bad_words_list, no_repeat_ngram_size, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor | None = None, encoder_input_lengths: Tensor | None = None)[source]
decode_stream(batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_list, bad_words_list, no_repeat_ngram_size, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor | None = None, encoder_input_lengths: Tensor | None = None)[source]
device: device
property dtype
finalize_decoder(context_lengths, batch_size, beam_width, scfg)[source]
property first_layer
property gather_all_token_logits
handle_per_step(cache_indirections: list, step: int, batch_size: int, max_context_length: int, beam_width: int, input_ids: Tensor, hidden_states: Tensor, scfg: SamplingConfig, kv_cache_block_pointers: list, prompt_embedding_table: Tensor, tasks: Tensor, context_lengths: Tensor, host_context_lengths, attention_mask: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, sequence_lengths: Tensor, next_step_buffer: dict, stop_words_list, bad_words_list, no_repeat_ngram_size, encoder_output: Tensor, encoder_input_lengths: Tensor)[source]
property has_position_embedding
property has_token_type_embedding
property head_size
property hidden_size
property last_layer
mapping: Mapping
property num_heads
property num_heads_kv
property num_layers
property paged_kv_cache
pp_communicate_final_output_ids(final_output_ids, batch_size, beam_width)[source]
pp_communicate_new_tokens(should_stop, cache_indir, sequence_length)[source]
property quant_mode
property remove_input_padding
runtime: _Runtime
setup(batch_size: int, max_context_length: int, max_new_tokens: int, beam_width: int = 1, encoder_max_input_length: int | None = None)[source]
property tokens_per_block
property use_custom_all_reduce
property use_gpt_attention_plugin
property vocab_size
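
A minimal usage sketch, assuming a serialized engine (engine_buffer) and a matching ModelConfig (model_config) are already available; those two names, the token ids, and the end/pad ids are placeholders, and the default Mapping() is only suitable for a single-GPU engine.

    import torch
    from tensorrt_llm import Mapping
    from tensorrt_llm.runtime import GenerationSession, SamplingConfig

    # model_config / engine_buffer: the ModelConfig and serialized engine the
    # session should run (placeholders here).
    session = GenerationSession(model_config, engine_buffer, Mapping())

    batch_size, input_len, max_new_tokens = 2, 32, 64
    input_ids = torch.randint(0, model_config.vocab_size, (batch_size, input_len),
                              dtype=torch.int32, device="cuda")
    context_lengths = torch.full((batch_size,), input_len, dtype=torch.int32, device="cuda")
    sampling = SamplingConfig(end_id=2, pad_id=0, top_k=1)  # ids depend on the tokenizer

    # Allocate runtime buffers for this batch shape, then generate.
    session.setup(batch_size=batch_size, max_context_length=input_len,
                  max_new_tokens=max_new_tokens, beam_width=1)
    output_ids = session.decode(input_ids, context_lengths, sampling)
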
class tensorrt_llm.runtime.KVCacheManager(memory_pools: List[Tensor], blocks: int, tokens_per_block: int, max_blocks_per_seq: int, beam_width: int = 1)[source]

Bases: object

add_sequence(sequence: GenerationSequence, context_len: int)[source]

Add a sequence to the manager and allocate the minimum number of blocks needed for its context

get_pointer_arrays(beam_width: int) List[Tensor][source]

Returns arrays of pointers for all memory pools, copied to the GPU

step(finished: List[bool])[source]

Advance to the next generation step: allocate new blocks where needed and release those of finished sequences.
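
A standalone sketch of the paged KV-cache bookkeeping. The pool shape and size below are placeholders: the real pools are engine-specific and are normally created by GenerationSession itself when paged_kv_cache is enabled.

    import torch
    from tensorrt_llm.runtime import GenerationSequence, KVCacheManager

    tokens_per_block, blocks, max_blocks_per_seq = 64, 128, 16
    # One pool per layer in the real runtime; a single dummy fp16 pool with a
    # made-up per-block payload is enough to illustrate the bookkeeping.
    pools = [torch.empty(blocks, 2, tokens_per_block * 1024,
                         dtype=torch.float16, device="cuda")]

    manager = KVCacheManager(memory_pools=pools, blocks=blocks,
                             tokens_per_block=tokens_per_block,
                             max_blocks_per_seq=max_blocks_per_seq, beam_width=1)

    # Register a sequence with a 100-token context; blocks covering it are reserved.
    manager.add_sequence(GenerationSequence(seq_idx=0, batch_idx=0), context_len=100)

    # After each generated token: grow blocks where needed, free finished sequences.
    manager.step(finished=[False])

    # Per-pool pointer arrays, copied to the GPU for the attention plugin to consume.
    pointer_arrays = manager.get_pointer_arrays(beam_width=1)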

class tensorrt_llm.runtime.ModelConfig(vocab_size: int, num_layers: int, num_heads: int, num_kv_heads: int, hidden_size: int, gpt_attention_plugin: bool, remove_input_padding: bool = False, model_name: str = '', paged_kv_cache: bool = False, cross_attention: bool = False, has_position_embedding: bool = True, has_token_type_embedding: bool = False, tokens_per_block: int = 64, use_prompt_tuning: bool = False, quant_mode: tensorrt_llm.quantization.mode.QuantMode = <QuantMode.0: 0>, gather_all_token_logits: bool = False, dtype: str = '', use_custom_all_reduce: bool = False)[source]

Bases: object

cross_attention: bool = False
dtype: str = ''
gather_all_token_logits: bool = False
gpt_attention_plugin: bool
has_position_embedding: bool = True
has_token_type_embedding: bool = False
hidden_size: int
model_name: str = ''
num_heads: int
num_kv_heads: int
num_layers: int
paged_kv_cache: bool = False
quant_mode: QuantMode = 0
remove_input_padding: bool = False
tokens_per_block: int = 64
use_custom_all_reduce: bool = False
use_prompt_tuning: bool = False
vocab_size: int
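
An illustrative ModelConfig for a hypothetical 32-layer, 4096-hidden decoder; every value here is a placeholder and must match the engine that was actually built.

    from tensorrt_llm.runtime import ModelConfig

    model_config = ModelConfig(
        vocab_size=32000,
        num_layers=32,
        num_heads=32,
        num_kv_heads=32,       # fewer than num_heads would indicate grouped/multi-query attention
        hidden_size=4096,
        gpt_attention_plugin=True,
        remove_input_padding=True,
        paged_kv_cache=True,
        tokens_per_block=64,
        dtype="float16",
    )
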
class tensorrt_llm.runtime.SamplingConfig(end_id: int, pad_id: int, num_beams: int = 1, temperature: float | torch.Tensor = 1.0, top_k: int | torch.Tensor = 1, top_p: float | torch.Tensor = 0.0, length_penalty: float | torch.Tensor = 1.0, repetition_penalty: float | torch.Tensor = 1.0, min_length: int | torch.Tensor = 1, presence_penalty: float | torch.Tensor = 0.0, use_beam_hyps: bool = True)[source]

Bases: object

beam_search_diversity_rate: float | Tensor = None
end_id: int
length_penalty: float | Tensor = 1.0
min_length: int | Tensor = 1
num_beams: int = 1
output_cum_log_probs: bool = False
output_log_probs: bool = False
pad_id: int
presence_penalty: float | Tensor = 0.0
random_seed: int | Tensor = None
repetition_penalty: float | Tensor = 1.0
temperature: float | Tensor = 1.0
top_k: int | Tensor = 1
top_p: float | Tensor = 0.0
use_beam_hyps: bool = True
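
A few typical SamplingConfig instances; the end_id/pad_id values are tokenizer-dependent placeholders.

    import torch
    from tensorrt_llm.runtime import SamplingConfig

    # Greedy decoding: top_k=1 always keeps the most probable token.
    greedy = SamplingConfig(end_id=2, pad_id=0, top_k=1)

    # Beam search over 4 beams with a mild length penalty.
    beam = SamplingConfig(end_id=2, pad_id=0, num_beams=4, length_penalty=0.9)

    # Most fields also accept per-request tensors, e.g. one temperature per batch item.
    per_request = SamplingConfig(end_id=2, pad_id=0,
                                 temperature=torch.tensor([0.7, 1.0]),
                                 top_k=torch.tensor([40, 1], dtype=torch.int32),
                                 top_p=torch.tensor([0.9, 0.0]))
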
class tensorrt_llm.runtime.Session(**kwargs)[source]

Bases: object

Session is a managed TensorRT runtime.

property context: IExecutionContext

@brief: Get the default TensorRT execution context. Use self.engine.create_execution_context() to create a new context if needed.

@return: one TensorRT execution context object

property engine: ICudaEngine
static from_engine(engine) Session[source]

@brief: Create a session from an existing ICudaEngine.
@param engine: an ICudaEngine
@return: a Session object

static from_serialized_engine(engine) Session[source]

@brief: Create a session from a serialized engine.
@param engine: a serialized engine
@return: a Session object

infer_shapes(inputs: List[TensorInfo], context=None) List[TensorInfo][source]
@brief: Set the input shapes on the given context and infer the output shapes from them.

This function should be called whenever the input shapes change, before calling run(); alternatively, call context.set_input_shape on all dynamically shaped input tensors manually.

@param inputs: list of TensorInfo objects, each representing an input tensor
@param context: TensorRT execution context; if None, the default context is used
@return: list of TensorInfo objects, each representing an output tensor, or None on failure

run(inputs: Dict[str, Any], outputs: Dict[str, Any], stream, context=None) bool[source]

@brief: Run the TensorRT engine with the given inputs and outputs.
@param inputs: dict of input tensors; key is the tensor name, value is a tensor pointer or torch tensor
@param outputs: dict of output tensors; key is the tensor name, value is a tensor pointer or torch tensor
@param stream: CUDA stream to enqueue the TensorRT engine on
@param context: TensorRT execution context; if None, the default context is used
@return: True if the enqueue succeeded; note that enqueue is an asynchronous call, so returning True does not mean the execution has finished

property runtime: Runtime
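
A minimal low-level sketch: deserialize an engine, infer the output shapes from the input shapes, then enqueue execution on a CUDA stream. The file name, the tensor name "input", and the float32 dtypes are placeholders tied to how the engine was built.

    import tensorrt as trt
    import torch
    from tensorrt_llm.runtime import Session, TensorInfo

    with open("model.engine", "rb") as f:          # placeholder path
        session = Session.from_serialized_engine(f.read())

    inputs = {"input": torch.ones(1, 16, dtype=torch.float32, device="cuda")}

    # Propagate the input shapes so the output shapes are known, then allocate outputs.
    output_info = session.infer_shapes([TensorInfo("input", trt.DataType.FLOAT, (1, 16))])
    outputs = {t.name: torch.empty(tuple(t.shape), dtype=torch.float32, device="cuda")
               for t in output_info}

    stream = torch.cuda.current_stream().cuda_stream
    ok = session.run(inputs, outputs, stream)      # asynchronous enqueue
    torch.cuda.current_stream().synchronize()      # wait for the results to be ready
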
class tensorrt_llm.runtime.TensorInfo(name: 'str', dtype: 'trt.DataType', shape: 'tuple')[source]

Bases: object

dtype: DataType
name: str
shape: tuple
tensorrt_llm.runtime.to_word_list_format(word_dict: List[List[str]], tokenizer=None)[source]
Format of word_dict:

len(word_dict) should equal batch_size. word_dict[i] holds the words for batch item i. len(word_dict[i]) must be 1, i.e. each entry contains exactly one string. That string may contain several phrases separated by ",". For example, if word_dict[2] = " I am happy, I am sad", this function returns the token ids for the two phrases " I am happy" and " I am sad".
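
A usage sketch, assuming a Hugging Face tokenizer; the model name and the phrases are placeholders. The result can be passed as the stop_words_list or bad_words_list argument of GenerationSession.decode().

    from transformers import AutoTokenizer
    from tensorrt_llm.runtime import to_word_list_format

    tokenizer = AutoTokenizer.from_pretrained("gpt2")   # placeholder tokenizer

    # One string per batch item; phrases inside a string are separated by ",".
    word_dict = [[" I am happy, I am sad"],
                 [" forbidden phrase"]]
    bad_words_list = to_word_list_format(word_dict, tokenizer=tokenizer)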