Runtime
- class tensorrt_llm.runtime.ChatGLMGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]
Bases:
GenerationSession
- class tensorrt_llm.runtime.GenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None)[source]
Bases:
object
- batch_size: int
- buffer_allocated: bool
- property cross_attention
- cuda_graph_mode: bool
- cuda_stream_guard()[source]
Sync external stream and set current stream to the one bound to the session. Reset on exit.
- debug_mode: bool
- debug_tensors_to_save: None
- decode(input_ids: Tensor, context_lengths: Tensor, sampling_config: SamplingConfig, prompt_embedding_table: Tensor = None, tasks: Tensor = None, prompt_vocab_size: Tensor = None, stop_words_list=None, bad_words_list=None, no_repeat_ngram_size=None, streaming: bool = False, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor = None, encoder_input_lengths: Tensor = None, **kwargs)[source]
- decode_batch(input_ids: Sequence[Tensor], sampling_config: SamplingConfig, streaming: bool = False, **kwargs)[source]
- decode_regular(batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_list, bad_words_list, no_repeat_ngram_size, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor | None = None, encoder_input_lengths: Tensor | None = None, **kwargs)[source]
- decode_stream(batch_size: int, scfg: SamplingConfig, sequence_lengths: Tensor, context_lengths: Tensor, host_context_lengths, max_context_length: int, beam_width: int, cache_indirections: list, input_ids: Tensor, hidden_states: Tensor, prompt_embedding_table: Tensor, tasks: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, stop_words_list, bad_words_list, no_repeat_ngram_size, output_sequence_lengths: bool = False, return_dict: bool = False, encoder_output: Tensor | None = None, encoder_input_lengths: Tensor | None = None, **kwargs)[source]
- device: device
- property dtype
- property first_layer
- property gather_all_token_logits
- handle_per_step(cache_indirections: list, step: int, batch_size: int, max_context_length: int, beam_width: int, input_ids: Tensor, hidden_states: Tensor, scfg: SamplingConfig, kv_cache_block_pointers: list, host_kv_cache_block_pointers: list, prompt_embedding_table: Tensor, tasks: Tensor, context_lengths: Tensor, host_context_lengths, attention_mask: Tensor, prompt_vocab_size: Tensor, ite: int, sequence_limit_lengths: Tensor, sequence_lengths: Tensor, next_step_buffer: dict, stop_words_list, bad_words_list, no_repeat_ngram_size, encoder_output: Tensor, encoder_input_lengths: Tensor)[source]
- property has_position_embedding
- property has_token_type_embedding
- property head_size
- property last_layer
- mapping: Mapping
- property max_prompt_embedding_table_size
- property num_heads
- property num_heads_kv
- property num_layers
- property paged_kv_cache
- property quant_mode
- property remove_input_padding
- runtime: _Runtime
- setup(batch_size: int, max_context_length: int, max_new_tokens: int, beam_width: int = 1, max_kv_cache_length: int | None = None, encoder_max_input_length: int | None = None, lora_manager: LoraManager | None = None, lora_uids: List[str] | None = None)[source]
- property tokens_per_block
- property use_custom_all_reduce
- property use_gpt_attention_plugin
- property use_lora_plugin
- property vocab_size
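Below is a minimal usage sketch for driving a GenerationSession directly. The engine file name, the ModelConfig values, and the SamplingConfig token ids are assumptions; in practice they come from the configuration produced when the engine was built.
```python
# A minimal sketch, assuming a single-GPU float16 engine built elsewhere.
# The engine file name, ModelConfig values, and end/pad token ids below are
# placeholders; read the real values from the engine's build configuration.
import torch
import tensorrt_llm
from tensorrt_llm.runtime import GenerationSession, ModelConfig, SamplingConfig

model_config = ModelConfig(
    vocab_size=32000, num_layers=32, num_heads=32, num_kv_heads=32,
    hidden_size=4096, gpt_attention_plugin=True, dtype='float16')
mapping = tensorrt_llm.Mapping(world_size=1, rank=0, tp_size=1)

with open('llama_float16_tp1_rank0.engine', 'rb') as f:  # hypothetical engine file
    engine_buffer = f.read()

session = GenerationSession(model_config, engine_buffer, mapping)

batch_size, input_len, max_new_tokens = 1, 8, 32
session.setup(batch_size=batch_size,
              max_context_length=input_len,
              max_new_tokens=max_new_tokens)

input_ids = torch.randint(0, model_config.vocab_size,
                          (batch_size, input_len),
                          dtype=torch.int32, device='cuda')
context_lengths = torch.full((batch_size,), input_len,
                             dtype=torch.int32, device='cuda')
sampling_config = SamplingConfig(end_id=2, pad_id=2)  # assumed token ids

output_ids = session.decode(input_ids, context_lengths, sampling_config)
```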
- class tensorrt_llm.runtime.KVCacheManager(memory_pools: List[Tensor], blocks: int, tokens_per_block: int, max_blocks_per_seq: int, max_kv_cache_len: int, beam_width: int = 1)[source]
Bases:
object
- add_sequence(sequence: GenerationSequence, context_len: int)[source]
Add a sequence to the manager and allocate the minimum number of blocks needed for its context.
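A sketch of sizing a single paged KV-cache pool and registering a sequence follows. The pool layout, the GenerationSequence import path, and its (seq_idx, batch_idx) constructor are assumptions, not the engine's exact layout.
```python
# A minimal sketch; the pool shape and the GenerationSequence import and
# constructor are assumptions rather than the engine's exact layout.
import torch
from tensorrt_llm.runtime import KVCacheManager
from tensorrt_llm.runtime.kv_cache_manager import GenerationSequence  # assumed path

num_layers, num_kv_heads, head_size = 32, 32, 128
tokens_per_block, blocks, max_blocks_per_seq = 64, 256, 16

# One pool holding key and value blocks for all layers (illustrative shape).
pool = torch.empty(blocks, num_layers, 2, num_kv_heads, tokens_per_block, head_size,
                   dtype=torch.float16, device='cuda')

manager = KVCacheManager(memory_pools=[pool],
                         blocks=blocks,
                         tokens_per_block=tokens_per_block,
                         max_blocks_per_seq=max_blocks_per_seq,
                         max_kv_cache_len=tokens_per_block * max_blocks_per_seq)

# Register a sequence with an 8-token context; the manager allocates the
# minimum number of blocks needed to cover those tokens.
manager.add_sequence(GenerationSequence(seq_idx=0, batch_idx=0), context_len=8)
```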
- class tensorrt_llm.runtime.ModelConfig(vocab_size: int, num_layers: int, num_heads: int, num_kv_heads: int, hidden_size: int, gpt_attention_plugin: bool, remove_input_padding: bool = False, model_name: str = '', paged_kv_cache: bool = False, cross_attention: bool = False, head_size: int = None, has_position_embedding: bool = True, has_token_type_embedding: bool = False, tokens_per_block: int = 64, max_prompt_embedding_table_size: int = 0, quant_mode: tensorrt_llm.quantization.mode.QuantMode = <QuantMode.0: 0>, gather_all_token_logits: bool = False, dtype: str = '', use_custom_all_reduce: bool = False, lora_plugin: bool = False, lora_target_modules: List[str] = <factory>)[source]
Bases:
object
- cross_attention: bool = False
- dtype: str = ''
- gather_all_token_logits: bool = False
- gpt_attention_plugin: bool
- has_position_embedding: bool = True
- has_token_type_embedding: bool = False
- head_size: int = None
- lora_plugin: bool = False
- lora_target_modules: List[str]
- max_prompt_embedding_table_size: int = 0
- model_name: str = ''
- num_heads: int
- num_kv_heads: int
- num_layers: int
- paged_kv_cache: bool = False
- remove_input_padding: bool = False
- tokens_per_block: int = 64
- use_custom_all_reduce: bool = False
- vocab_size: int
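A sketch of constructing a ModelConfig by hand is shown below; the values are illustrative only, and in practice they are read from the configuration emitted when the engine was built.
```python
# A minimal sketch with illustrative values; real values come from the
# configuration produced by the engine build step.
from tensorrt_llm.runtime import ModelConfig

model_config = ModelConfig(
    vocab_size=32000,
    num_layers=32,
    num_heads=32,
    num_kv_heads=32,
    hidden_size=4096,
    gpt_attention_plugin=True,
    remove_input_padding=True,
    paged_kv_cache=True,
    tokens_per_block=64,
    dtype='float16',
)
```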
- class tensorrt_llm.runtime.ModelRunner(session: GenerationSession, max_batch_size: int, max_input_len: int, lora_manager: LoraManager | None = None)[source]
Bases:
object
An interface class that wraps GenerationSession and provides generation methods.
- classmethod from_dir(engine_dir: str, lora_dir: str | None = None, rank: int = 0, debug_mode: bool = False) ModelRunner[source]
Create a ModelRunner instance from an engine directory.
- Parameters:
engine_dir (str) – The directory that contains the serialized engine files and config files.
lora_dir (str) – The directory that contains LoRA weights.
rank (int) – The runtime rank id.
debug_mode (bool) – Whether or not to turn on debug mode.
- Returns:
An instance of ModelRunner.
- Return type:
ModelRunner
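A sketch of creating a ModelRunner from an engine directory follows; the directory path is a placeholder for wherever the engine build step wrote its output.
```python
# A minimal sketch; './engine_outputs' is a placeholder engine directory.
from tensorrt_llm.runtime import ModelRunner

runner = ModelRunner.from_dir(engine_dir='./engine_outputs', rank=0)
```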
- generate(batch_input_ids: List[Tensor], sampling_config: SamplingConfig | None = None, prompt_table_path: str | None = None, prompt_tasks: str | None = None, lora_uids: list | None = None, streaming: bool = False, **kwargs) Tensor | dict[source]
Generates sequences of token ids. The generation-controlling parameters are set in sampling_config; a default configuration is used if it is not passed. You can override any of sampling_config's attributes by passing the corresponding parameters.
- Parameters:
batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).
sampling_config (Optional[SamplingConfig]) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.
prompt_table_path (str) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py).
prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).
lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.
kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.
- Returns:
If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.session.gather_all_token_logits=True).
- Return type:
torch.Tensor or dict
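A sketch of a generate() call is shown below; the engine directory, input token ids, and end/pad ids are assumptions, and the extra keyword arguments override the corresponding SamplingConfig attributes.
```python
# A minimal sketch; the engine directory, input ids, and end/pad token ids are
# placeholders that would normally come from the engine build and its tokenizer.
import torch
from tensorrt_llm.runtime import ModelRunner

runner = ModelRunner.from_dir(engine_dir='./engine_outputs')  # placeholder path
batch_input_ids = [torch.tensor([1, 887, 526, 263], dtype=torch.int32)]

outputs = runner.generate(batch_input_ids,
                          max_new_tokens=32,
                          end_id=2, pad_id=2,        # assumed token ids
                          temperature=0.8, top_k=50,
                          output_sequence_lengths=True,
                          return_dict=True)
output_ids = outputs['output_ids']
sequence_lengths = outputs['sequence_lengths']
```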
- property max_prompt_embedding_table_size: int
- property remove_input_padding: bool
- property use_lora_plugin: bool
- class tensorrt_llm.runtime.QWenForCausalLMGenerationSession(model_config: ModelConfig, engine_buffer, mapping: Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream: Stream | None = None, global_max_input_length: int = 2048, global_max_output_length: int = 4096)[source]
Bases:
GenerationSession
- class tensorrt_llm.runtime.Session(**kwargs)[source]
Bases:
object
Session is a managed TensorRT runtime.
- property context: IExecutionContext
@brief: Get the default TensorRT execution context; use self.engine.create_execution_context() to create a new context if needed. @return: one TensorRT execution context object
- property engine: ICudaEngine
- static from_engine(engine) Session[source]
@brief: Create a session from an existing ICudaEngine engine @param engine: an ICudaEngine @return: a Session object
- static from_serialized_engine(engine) Session[source]
@brief: Create a session from a serialized engine @param engine: a serialized engine @return: a Session object
- infer_shapes(inputs: List[TensorInfo], context: IExecutionContext | None = None) List[TensorInfo][source]
- @brief: Set input shapes on the given context, and infer the output shapes from the given input shapes.
This function should be called every time the input shapes change, before calling run(). Alternatively, call context.set_input_shape on all dynamically shaped input tensors manually.
@param inputs: list of TensorInfo objects, each item represents an input tensor @param context: TensorRT execution context, if None, use the default context @return: list of TensorInfo objects, each item represents an output tensor, returns None if failed
- run(inputs: Dict[str, Any], outputs: Dict[str, Any], stream, context=None) bool[source]
@brief: Run the TensorRT engine with the given inputs and outputs @param inputs: dict of input tensors, key is tensor name, value is tensor pointer or torch tensor @param outputs: dict of output tensors, key is tensor name, value is tensor pointer or torch tensor @param stream: CUDA stream on which to enqueue the TensorRT engine @param context: TensorRT execution context, if None, use the default context @return: True if the enqueue succeeded; note that enqueue is an async call, so returning True does not mean the execution has finished
- property runtime: Runtime
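A sketch of running a raw engine through Session is shown below; the engine file, the tensor names and shapes, and the output dtype are assumptions to be replaced with your engine's actual I/O description.
```python
# A minimal sketch; tensor names, shapes, and dtypes are assumptions taken
# from a hypothetical engine with a single 'input_ids' input.
import torch
import tensorrt as trt
from tensorrt_llm.runtime import Session, TensorInfo

with open('model.engine', 'rb') as f:                # hypothetical engine file
    session = Session.from_serialized_engine(f.read())

# Resolve output shapes for the current input shapes before calling run().
inputs_info = [TensorInfo('input_ids', trt.DataType.INT32, (1, 8))]
outputs_info = session.infer_shapes(inputs_info)

inputs = {'input_ids': torch.ones(1, 8, dtype=torch.int32, device='cuda')}
outputs = {t.name: torch.empty(tuple(t.shape),
                               dtype=torch.float16,  # assumed output dtype
                               device='cuda')
           for t in outputs_info}

stream = torch.cuda.current_stream().cuda_stream
ok = session.run(inputs, outputs, stream)            # async enqueue
torch.cuda.synchronize()                             # wait for completion
```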
- class tensorrt_llm.runtime.TensorInfo(name: 'str', dtype: 'trt.DataType', shape: 'tuple')[source]
Bases:
object
- dtype: DataType
- name: str
- shape: tuple
- tensorrt_llm.runtime.to_word_list_format(word_dict: List[List[str]], tokenizer=None, add_special_tokens=False)[source]
- Format of word_dict:
len(word_dict) should equal the batch_size. word_dict[i] contains the words for batch item i. len(word_dict[i]) must be 1, i.e., it contains exactly one string. That string may contain several sentences separated by “,”. For example, if word_dict[2] = “ I am happy, I am sad”, this function returns the ids for the two short sentences “ I am happy” and “ I am sad”.
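A sketch of building a bad-words tensor with to_word_list_format follows; the tokenizer is an assumption (any tokenizer matching the engine's vocabulary), and the conversion to a CUDA tensor mirrors how the result is typically passed to decode().
```python
# A minimal sketch; 'gpt2' is a placeholder tokenizer.
import torch
from transformers import AutoTokenizer
from tensorrt_llm.runtime import to_word_list_format

tokenizer = AutoTokenizer.from_pretrained('gpt2')   # placeholder tokenizer
# One entry per batch item; each entry is a single comma-separated string.
bad_words = to_word_list_format([[" I am happy, I am sad"]], tokenizer=tokenizer)
bad_words = torch.from_numpy(bad_words).cuda()      # pass as bad_words_list to decode()
```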