From 522f912bf772e1505847084657497465aca814b9 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Tue, 1 Jul 2025 09:49:04 +0000
Subject: [PATCH] Update latest GitHub pages to v1.0.0rc1
---
latest/.buildinfo | 2 +-
latest/_cpp_gen/executor.html | 75 +++-
latest/_cpp_gen/runtime.html | 135 +++----
.../attention.py | 226 ++++++++++-
.../llm_args.py | 156 ++------
latest/_modules/index.html | 8 +-
latest/_modules/tensorrt_llm/builder.html | 8 +-
.../tensorrt_llm/disaggregated_params.html | 8 +-
.../tensorrt_llm/executor/result.html | 13 +-
.../_modules/tensorrt_llm/executor/utils.html | 8 +-
latest/_modules/tensorrt_llm/functional.html | 8 +-
.../tensorrt_llm/layers/activation.html | 8 +-
.../tensorrt_llm/layers/attention.html | 8 +-
latest/_modules/tensorrt_llm/layers/cast.html | 8 +-
latest/_modules/tensorrt_llm/layers/conv.html | 8 +-
.../tensorrt_llm/layers/embedding.html | 8 +-
.../_modules/tensorrt_llm/layers/linear.html | 8 +-
latest/_modules/tensorrt_llm/layers/mlp.html | 8 +-
.../tensorrt_llm/layers/normalization.html | 8 +-
.../_modules/tensorrt_llm/layers/pooling.html | 8 +-
.../tensorrt_llm/llmapi/build_cache.html | 8 +-
latest/_modules/tensorrt_llm/llmapi/llm.html | 355 +++++++++++-------
.../tensorrt_llm/llmapi/llm_args.html | 164 ++------
.../tensorrt_llm/llmapi/mpi_session.html | 8 +-
.../tensorrt_llm/models/baichuan/model.html | 8 +-
.../tensorrt_llm/models/bert/model.html | 8 +-
.../tensorrt_llm/models/bloom/model.html | 8 +-
.../tensorrt_llm/models/chatglm/config.html | 8 +-
.../tensorrt_llm/models/chatglm/model.html | 8 +-
.../tensorrt_llm/models/clip/model.html | 8 +-
.../tensorrt_llm/models/cogvlm/config.html | 8 +-
.../tensorrt_llm/models/cogvlm/model.html | 8 +-
.../tensorrt_llm/models/commandr/model.html | 8 +-
.../tensorrt_llm/models/dbrx/config.html | 8 +-
.../tensorrt_llm/models/dbrx/model.html | 8 +-
.../models/deepseek_v1/model.html | 8 +-
.../models/deepseek_v2/model.html | 8 +-
.../tensorrt_llm/models/dit/model.html | 8 +-
.../tensorrt_llm/models/eagle/model.html | 8 +-
.../tensorrt_llm/models/enc_dec/model.html | 8 +-
.../tensorrt_llm/models/falcon/config.html | 8 +-
.../tensorrt_llm/models/falcon/model.html | 8 +-
.../tensorrt_llm/models/gemma/config.html | 8 +-
.../tensorrt_llm/models/gemma/model.html | 8 +-
.../tensorrt_llm/models/gpt/config.html | 8 +-
.../tensorrt_llm/models/gpt/model.html | 8 +-
.../tensorrt_llm/models/gptj/config.html | 8 +-
.../tensorrt_llm/models/gptj/model.html | 8 +-
.../tensorrt_llm/models/gptneox/model.html | 8 +-
.../tensorrt_llm/models/llama/config.html | 8 +-
.../tensorrt_llm/models/llama/model.html | 8 +-
.../tensorrt_llm/models/mamba/model.html | 8 +-
.../tensorrt_llm/models/medusa/config.html | 8 +-
.../tensorrt_llm/models/medusa/model.html | 8 +-
.../tensorrt_llm/models/mllama/model.html | 8 +-
.../tensorrt_llm/models/mmdit_sd3/model.html | 8 +-
.../tensorrt_llm/models/modeling_utils.html | 8 +-
.../tensorrt_llm/models/mpt/model.html | 8 +-
.../models/multimodal_encoders/config.html | 8 +-
.../models/multimodal_encoders/model.html | 8 +-
.../tensorrt_llm/models/opt/model.html | 8 +-
.../tensorrt_llm/models/phi/model.html | 8 +-
.../tensorrt_llm/models/phi3/model.html | 8 +-
.../models/recurrentgemma/model.html | 8 +-
.../tensorrt_llm/models/redrafter/model.html | 47 ++-
.../_modules/tensorrt_llm/plugin/plugin.html | 8 +-
.../tensorrt_llm/quantization/mode.html | 8 +-
.../quantization/quantize_by_modelopt.html | 8 +-
.../runtime/enc_dec_model_runner.html | 8 +-
.../tensorrt_llm/runtime/generation.html | 8 +-
.../runtime/kv_cache_manager.html | 8 +-
.../tensorrt_llm/runtime/model_runner.html | 8 +-
.../runtime/model_runner_cpp.html | 8 +-
.../runtime/multimodal_model_runner.html | 8 +-
.../tensorrt_llm/runtime/session.html | 8 +-
.../tensorrt_llm/sampling_params.html | 10 +-
latest/advanced/disaggregated-service.html | 8 +-
latest/advanced/executor.html | 8 +-
latest/advanced/expert-parallelism.html | 8 +-
latest/advanced/gpt-attention.html | 8 +-
latest/advanced/gpt-runtime.html | 8 +-
latest/advanced/graph-rewriting.html | 8 +-
latest/advanced/kv-cache-management.html | 8 +-
latest/advanced/kv-cache-reuse.html | 8 +-
latest/advanced/lora.html | 8 +-
.../advanced/lowprecision-pcie-allreduce.html | 8 +-
.../open-sourced-cutlass-kernels.html | 8 +-
latest/advanced/speculative-decoding.html | 8 +-
latest/advanced/weight-streaming.html | 8 +-
latest/architecture/add-model.html | 8 +-
latest/architecture/checkpoint.html | 8 +-
latest/architecture/core-concepts.html | 8 +-
latest/architecture/model-weights-loader.html | 8 +-
latest/architecture/overview.html | 8 +-
latest/architecture/workflow.html | 8 +-
...actice_on_DeepSeek-R1_in_TensorRT-LLM.html | 8 +-
latest/blogs/Falcon180B-H200.html | 8 +-
latest/blogs/H100vsA100.html | 8 +-
latest/blogs/H200launch.html | 8 +-
latest/blogs/XQA-kernel.html | 8 +-
latest/blogs/quantization-in-TRT-LLM.html | 8 +-
...ek-R1_Performance_on_NVIDIA_B200_GPUs.html | 8 +-
...1_MTP_Implementation_and_Optimization.html | 8 +-
...1_Throughput_on_NVIDIA_Blackwell_GPUs.html | 8 +-
...ng_Expert_Parallelism_in_TensorRT-LLM.html | 8 +-
...Disaggregated_Serving_in_TensorRT-LLM.html | 8 +-
latest/commands/trtllm-build.html | 8 +-
latest/commands/trtllm-serve.html | 8 +-
.../build-image-to-dockerhub.html | 8 +-
latest/dev-on-cloud/dev-on-runpod.html | 8 +-
latest/examples/curl_chat_client.html | 8 +-
.../curl_chat_client_for_multimodal.html | 8 +-
latest/examples/curl_completion_client.html | 8 +-
latest/examples/customization.html | 8 +-
.../deepseek_r1_reasoning_parser.html | 8 +-
latest/examples/genai_perf_client.html | 8 +-
.../genai_perf_client_for_multimodal.html | 8 +-
latest/examples/index.html | 8 +-
latest/examples/llm_api_examples.html | 8 +-
latest/examples/llm_auto_parallel.html | 8 +-
latest/examples/llm_eagle2_decoding.html | 8 +-
latest/examples/llm_eagle_decoding.html | 8 +-
latest/examples/llm_guided_decoding.html | 8 +-
latest/examples/llm_inference.html | 8 +-
latest/examples/llm_inference_async.html | 8 +-
.../llm_inference_async_streaming.html | 8 +-
latest/examples/llm_inference_customize.html | 8 +-
.../examples/llm_inference_distributed.html | 8 +-
latest/examples/llm_inference_kv_events.html | 8 +-
latest/examples/llm_logits_processor.html | 8 +-
latest/examples/llm_lookahead_decoding.html | 8 +-
latest/examples/llm_medusa_decoding.html | 8 +-
latest/examples/llm_mgmn_llm_distributed.html | 8 +-
latest/examples/llm_mgmn_trtllm_bench.html | 8 +-
latest/examples/llm_mgmn_trtllm_serve.html | 8 +-
latest/examples/llm_multilora.html | 8 +-
latest/examples/llm_quantization.html | 8 +-
latest/examples/openai_chat_client.html | 8 +-
.../openai_chat_client_for_multimodal.html | 8 +-
latest/examples/openai_completion_client.html | 8 +-
latest/examples/trtllm_serve_examples.html | 8 +-
latest/genindex.html | 74 ++--
latest/index.html | 8 +-
.../installation/build-from-source-linux.html | 8 +-
latest/installation/grace-hopper.html | 8 +-
latest/installation/linux.html | 8 +-
latest/key-features.html | 8 +-
latest/llm-api/index.html | 8 +-
latest/llm-api/reference.html | 74 +++-
latest/objects.inv | Bin 147183 -> 147331 bytes
latest/overview.html | 8 +-
latest/performance/perf-analysis.html | 8 +-
latest/performance/perf-benchmarking.html | 8 +-
latest/performance/perf-overview.html | 8 +-
.../benchmarking-default-performance.html | 8 +-
.../deciding-model-sharding-strategy.html | 8 +-
.../fp8-quantization.html | 8 +-
.../performance-tuning-guide/index.html | 8 +-
...ing-max-batch-size-and-max-num-tokens.html | 8 +-
.../useful-build-time-flags.html | 8 +-
.../useful-runtime-flags.html | 8 +-
latest/py-modindex.html | 8 +-
.../python-api/tensorrt_llm.functional.html | 8 +-
latest/python-api/tensorrt_llm.layers.html | 8 +-
latest/python-api/tensorrt_llm.models.html | 65 ++--
latest/python-api/tensorrt_llm.plugin.html | 8 +-
.../python-api/tensorrt_llm.quantization.html | 8 +-
latest/python-api/tensorrt_llm.runtime.html | 8 +-
latest/quick-start-guide.html | 8 +-
latest/reference/ci-overview.html | 8 +-
latest/reference/memory.html | 8 +-
latest/reference/precision.html | 8 +-
latest/reference/support-matrix.html | 8 +-
latest/reference/troubleshooting.html | 8 +-
latest/release-notes.html | 8 +-
latest/scripts/disaggregated/README.html | 8 +-
latest/search.html | 8 +-
latest/searchindex.js | 2 +-
latest/torch.html | 8 +-
latest/torch/adding_new_model.html | 8 +-
latest/torch/arch_overview.html | 8 +-
latest/torch/attention.html | 8 +-
latest/torch/kv_cache_manager.html | 8 +-
latest/torch/scheduler.html | 8 +-
184 files changed, 1447 insertions(+), 1303 deletions(-)
diff --git a/latest/.buildinfo b/latest/.buildinfo
index 6659c65362..fff48eff5a 100644
--- a/latest/.buildinfo
+++ b/latest/.buildinfo
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 5dd2b8f29ac03c9c53f8ad8ba1fb6dcc
+config: 812228e223c943ca4d4a375a1c33a00f
tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html
index 451a72bcc0..aeaf7f62bd 100644
--- a/latest/_cpp_gen/executor.html
+++ b/latest/_cpp_gen/executor.html
@@ -3007,10 +3007,10 @@
Private Functions
--
-void forwardDispatch(
+
-
+void forwardDispatch(
- decoder::DecoderState const &decoderState,
-- decoder_batch::Output &output,
- decoder_batch::Input const &input,
-)
+)
Calls decoders for tokens per engine step.
@@ -10992,7 +10951,23 @@ one more than decoding draft tokens for prediction from primary head
)
-
+
Setup buffers for the decoder excluding speculative decoding.
+
+
+
+-
+void setupCacheIndirection(
+
+
+- SizeType32 maxBatchSize,
+- SizeType32 maxBeamWidth,
+- SizeType32 maxAttentionWindow,
+
+
+)
+Setup buffers for the cache indirection.
+This is used for beam search on pipeline parallel ranks without a decoder.
+
-
@@ -11332,6 +11307,18 @@ one more than decoding draft tokens for prediction from primary head
Workspace for beam search in streaming mode.
+
+-
+TensorPtr getCacheIndirectionInput() const
+Cache indirection input for beam search.
+
+
+
+-
+TensorPtr getCacheIndirectionOutput() const
+Cache indirection output for beam search.
+
+
-
DecodingInput &getJointDecodingInput() const
@@ -13262,8 +13249,8 @@ one more than decoding draft tokens for prediction from primary head
TensorPtr
setup()
disableLookahead()
-forwardAsync()
-forward()
+forwardAsync()
+forward()
finalize()
IGptDecoderBatched()
~IGptDecoderBatched()
@@ -13280,7 +13267,6 @@ one more than decoding draft tokens for prediction from primary head
maxDecoderSteps
batchSlots
batchSlotsRequestOrder
-cacheIndirection
generationSteps
predictedDraftLogits
explicitDraftTokensInputs
@@ -13289,12 +13275,6 @@ one more than decoding draft tokens for prediction from primary head
eagleLastInputs
-tensorrt_llm::runtime::decoder_batch::Output
-
@@ -13331,14 +13311,14 @@ one more than decoding draft tokens for prediction from primary head
GptDecoderBatched()
setup()
disableLookahead()
-forwardAsync()
-forward()
+forwardAsync()
+forward()
finalize()
getDecoderStream()
getUnderlyingDecoder()
getBufferManager()
GptDecoderPtr
-forwardDispatch()
+forwardDispatch()
mRuntimeStream
mDecoderStream
mBufferManager
@@ -13492,6 +13472,7 @@ one more than decoding draft tokens for prediction from primary head
DecoderState()
allocateSpeculativeDecodingBuffers()
setup()
+setupCacheIndirection()
setupSpeculativeDecoding()
disableLookahead()
getFinishedSum()
@@ -13527,6 +13508,8 @@ one more than decoding draft tokens for prediction from primary head
getEagleBuffers()
getLookaheadBuffers()
getBeamSearchBuffers()
+getCacheIndirectionInput()
+getCacheIndirectionOutput()
getJointDecodingInput()
getJointDecodingOutput()
mMaxBatchSize
diff --git a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
index df03e74186..09671696aa 100644
--- a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
+++ b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
@@ -5,6 +5,7 @@ from typing import Optional, Union, cast
import torch
from torch import nn
+from tensorrt_llm._utils import get_sm_version
from tensorrt_llm.logger import logger
from tensorrt_llm.mapping import Mapping
@@ -346,6 +347,47 @@ def mla_custom_op_inplace(
mla_layer.forward_impl(position_ids, hidden_states, metadata, output=output)
+def fp8_block_scaling_bmm_out(
+ mat1: torch.Tensor,
+ mat2_fp8: torch.Tensor,
+ mat2_scale: torch.Tensor,
+ out: torch.Tensor,
+) -> torch.Tensor:
+ sm_version = get_sm_version()
+ if sm_version == 90:
+ mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
+ mat1)
+ torch.ops.trtllm.fp8_block_scaling_bmm_out(mat1_fp8, mat2_fp8,
+ mat1_scale, mat2_scale, out)
+ elif sm_version == 100:
+ low_latency = True
+ use_deep_seek_fp8 = True
+ tile_size = 8
+ epilogue_tile_m = 64 if use_deep_seek_fp8 else 128
+ m_size = mat1.shape[0]
+ if m_size % tile_size != 0:
+ tiled_shape = ((m_size + tile_size - 1) // tile_size) * tile_size
+ mat1 = torch.nn.functional.pad(
+ mat1, (0, 0, 0, 0, 0, tiled_shape - m_size), "constant", 0)
+
+ mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
+ mat1)
+ output, output_sf = torch.ops.trtllm.fp8_batched_gemm_trtllmgen(
+ mat1_fp8,
+ mat2_fp8,
+ tile_size=tile_size,
+ epilogue_tile_m=epilogue_tile_m,
+ use_deep_seek_fp8=use_deep_seek_fp8,
+ low_latency=low_latency,
+ dq_sfs_a=mat1_scale.reshape(mat1.shape[-1] // 128, -1),
+ dq_sfs_b=mat2_scale,
+ out_dtype=out.dtype,
+ )
+ out.copy_(output[:, :m_size])
+ else:
+ raise NotImplementedError(f"SM{sm_version} is not supported")
+
+
class MLA(nn.Module):
def __init__(
@@ -922,6 +964,166 @@ class MLA(nn.Module):
return attn_output
+ def forward_context_with_chunked_prefill(
+ self,
+ q: torch.Tensor,
+ compressed_kv: torch.Tensor,
+ latent_cache: torch.
+ Tensor, # compressed_kv + k_pe [context_tokens, 1, lora_size + rope_size]
+ attn_metadata: TrtllmAttentionMetadata,
+ output: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ trtllm_attention = cast(TrtllmAttention, self.mha)
+ # apply RoPE, append compressed_kv + k_pe to paged kv cache and assign q_pe to q
+ trtllm_attention.mla_rope_append_paged_kv_assign_q(
+ q, latent_cache, attn_metadata)
+
+        # determine the number of chunked loops
+ # currently we assume that the chunk size is the same as the max_num_tokens
+ chunk_size = attn_metadata.runtime_features.chunk_size
+ chunked_loop_num = attn_metadata.chunked_loop_num
+
+        # [total_token_q, num_heads, 2] -> [total_token_q, num_heads] float2
+ self.softmax_stats_tensor = torch.empty(
+ (attn_metadata.num_ctx_tokens, self.num_heads, 2),
+ dtype=torch.float,
+ device='cuda',
+ )
+ self.temp_softmax_stats_tensor = torch.empty(
+ (attn_metadata.num_ctx_tokens, self.num_heads, 2),
+ dtype=torch.float,
+ device='cuda',
+ )
+ if output is None:
+ attn_output = q.new_empty(
+ (q.size(0), self.num_heads * self.v_head_dim), dtype=q.dtype)
+ else:
+ attn_output = output
+ temp_attn_output = q.new_empty(
+ (q.size(0), self.num_heads * self.v_head_dim), dtype=q.dtype)
+
+ # use fake cached_cu_seq_len for chunked loop
+ origin_kv_lens_cuda_runtime = attn_metadata.kv_lens_cuda_runtime
+ origin_kv_lens_runtime = attn_metadata.kv_lens_runtime
+
+ for loop_idx in range(chunked_loop_num):
+ # {b, chunked_unit_size, h, kv_lora_rank + qk_rope_head_dim} zero padded
+ # fetch `loop_idx` chunk from kv cache
+ temp_cu_chunked_seq_len = attn_metadata.cu_chunked_seq_len[loop_idx]
+ total_ctx_chunked_tokens = attn_metadata.host_cu_chunked_seq_len[
+ loop_idx, attn_metadata.num_contexts]
+ chunked_compressed_kv, chunked_k_pe = trtllm_attention.load_chunked_kv_cache_for_mla(
+ metadata=attn_metadata,
+ chunked_idx=loop_idx,
+ num_ctx_cached_tokens=total_ctx_chunked_tokens,
+ cu_chunked_seq_len=temp_cu_chunked_seq_len,
+ out_dtype=q.dtype)
+
+ # up proj to uncompressed kv
+ # [tokens, 2, h, kv_dim], without rope_dim
+ chunked_kv = self.kv_b_proj(chunked_compressed_kv)
+
+ # build full_kv
+ # full_kv {B, 2, chunk_size / tokens_per_block, h, tokens_per_block, kv_dim + rope_dim}
+ tokens_per_block = attn_metadata.kv_cache_manager.tokens_per_block
+ full_kv = torch.zeros([
+ attn_metadata.num_contexts, 2,
+ (chunk_size + tokens_per_block - 1) // tokens_per_block,
+ self.num_heads, tokens_per_block,
+ max(self.qk_nope_head_dim + self.qk_rope_head_dim,
+ self.v_head_dim)
+ ],
+ dtype=q.dtype,
+ device=q.device)
+ mla_kv_cache_block_offsets = trtllm_attention.set_chunked_kv_cache_for_mla(
+ full_kv,
+ chunked_kv,
+ chunked_k_pe,
+ cu_chunked_seq_len=temp_cu_chunked_seq_len,
+ cached=True,
+ metadata=attn_metadata)
+
+ # copy chunked_seq_len to replace kv_lens_runtime
+ attn_metadata.kv_lens_runtime = attn_metadata.host_chunked_seq_len[
+ loop_idx]
+ attn_metadata.kv_lens_cuda_runtime = attn_metadata.chunked_seq_len[
+ loop_idx]
+ out_scale = None
+ # do not apply mask for attention within loop
+ temp_attn_output = self.mha.forward(
+ q,
+ None,
+ None,
+ attn_metadata,
+ attention_input_type=AttentionInputType.context_only,
+ latent_cache=None,
+ out_scale=out_scale,
+ attention_mask=PredefinedAttentionMask.FULL,
+ mla_context_paged_kv=full_kv,
+ mla_context_kv_cache_block_offsets=mla_kv_cache_block_offsets,
+ softmax_stats_tensor=self.temp_softmax_stats_tensor,
+ output=temp_attn_output,
+ )
+ # merge attn result
+ temp_merge_op = attn_metadata.merge_op_tensor[loop_idx]
+ trtllm_attention.merge_attention_for_mla(
+ attn_output, temp_attn_output, self.softmax_stats_tensor,
+ self.temp_softmax_stats_tensor, temp_merge_op, attn_metadata)
+
+ # deal with the uncached kv
+ kv = self.kv_b_proj(compressed_kv)
+ _, k_pe = latent_cache.view([
+ -1, self.kv_lora_rank + self.qk_rope_head_dim
+ ]).split([self.kv_lora_rank, self.qk_rope_head_dim], -1)
+ k_pe = k_pe.contiguous()
+ # final round of attention
+
+ # out_scale = getattr(self.o_proj, "inv_input_scale", None)
+ out_scale = None # Currently we use BF16 MHA for context phase
+
+ tokens_per_block = attn_metadata.kv_cache_manager.tokens_per_block
+ full_kv = torch.zeros([
+ attn_metadata.num_contexts, 2,
+ (attn_metadata.max_ctx_seq_len + tokens_per_block - 1) //
+ tokens_per_block, self.num_heads, tokens_per_block,
+ max(self.qk_nope_head_dim + self.qk_rope_head_dim, self.v_head_dim)
+ ],
+ dtype=q.dtype,
+ device=q.device)
+ mla_kv_cache_block_offsets = trtllm_attention.set_chunked_kv_cache_for_mla(
+ full_kv,
+ kv,
+ k_pe,
+ cu_chunked_seq_len=None,
+ cached=False,
+ metadata=attn_metadata)
+ # copy q_lens to replace kv_lens_runtime
+ attn_metadata.kv_lens_runtime = attn_metadata.prompt_lens_cpu_runtime
+ attn_metadata.kv_lens_cuda_runtime = attn_metadata.prompt_lens_cuda_runtime
+ temp_attn_output = self.mha.forward(
+ q,
+ None,
+ None,
+ attn_metadata,
+ attention_input_type=AttentionInputType.context_only,
+ latent_cache=None,
+ out_scale=out_scale,
+ mla_context_paged_kv=full_kv,
+ mla_context_kv_cache_block_offsets=mla_kv_cache_block_offsets,
+ softmax_stats_tensor=self.temp_softmax_stats_tensor,
+ output=temp_attn_output,
+ )
+ temp_merge_op = attn_metadata.merge_op_tensor[chunked_loop_num]
+ trtllm_attention.merge_attention_for_mla(attn_output, temp_attn_output,
+ self.softmax_stats_tensor,
+ self.temp_softmax_stats_tensor,
+ temp_merge_op, attn_metadata)
+ # copy back kv_lens_runtime and kv_lens_cuda_runtime
+ attn_metadata.kv_lens_runtime = origin_kv_lens_runtime
+ attn_metadata.kv_lens_cuda_runtime = origin_kv_lens_cuda_runtime
+
+ return attn_output
+
def forward_context(
self,
q: torch.Tensor,
@@ -934,7 +1136,11 @@ class MLA(nn.Module):
if isinstance(self.mha, TrtllmAttention):
assert isinstance(attn_metadata, TrtllmAttentionMetadata)
trtllm_attention = cast(TrtllmAttention, self.mha)
- if trtllm_attention.has_cached_kv_for_mla_context(attn_metadata):
+ if trtllm_attention.is_chunked_prefill_for_mla_context(
+ attn_metadata):
+ return self.forward_context_with_chunked_prefill(
+ q, compressed_kv, latent_cache, attn_metadata, output)
+ elif trtllm_attention.has_cached_kv_for_mla_context(attn_metadata):
return self.forward_context_with_cached_kv(
q, latent_cache, attn_metadata, output)
return self.forward_context_default(q, compressed_kv, k_pe,
@@ -976,15 +1182,11 @@ class MLA(nn.Module):
self.k_b_proj_trans.transpose(1, 2),
q_nope_out)
elif self.k_b_proj_trans.dtype == torch.float8_e4m3fn:
- q_nope_fp8, q_nope_scales = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
- q_nope)
# [num_heads, num_tokens, self.kv_lora_rank]
q_nope_out = fused_q[..., :self.kv_lora_rank].transpose(0, 1)
- torch.ops.trtllm.fp8_block_scaling_bmm_out(
- q_nope_fp8, self.k_b_proj_trans, q_nope_scales,
- self.k_b_proj_trans_scale, q_nope_out)
- q_nope_scales = None
+ fp8_block_scaling_bmm_out(q_nope, self.k_b_proj_trans,
+ self.k_b_proj_trans_scale, q_nope_out)
else:
raise NotImplementedError(
f"Missing bmm impl for dtype: {self.k_b_proj_trans.dtype}.")
@@ -1033,13 +1235,9 @@ class MLA(nn.Module):
self.v_b_proj.transpose(1, 2),
attn_output.transpose(0, 1))
elif self.v_b_proj.dtype == torch.float8_e4m3fn:
- attn_out_latent, attn_out_latent_scales = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
- attn_out_latent)
-
- torch.ops.trtllm.fp8_block_scaling_bmm_out(
- attn_out_latent, self.v_b_proj, attn_out_latent_scales,
- self.v_b_proj_scale, attn_output.transpose(0, 1))
- attn_out_latent_scales = None
+ fp8_block_scaling_bmm_out(attn_out_latent, self.v_b_proj,
+ self.v_b_proj_scale,
+ attn_output.transpose(0, 1))
else:
raise NotImplementedError(
f"Missing bmm impl for dtype: {self.v_b_proj.dtype}.")
diff --git a/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py b/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
index 636740d599..affd945635 100644
--- a/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
+++ b/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
@@ -7,7 +7,7 @@ from dataclasses import dataclass, field
from enum import Enum, EnumMeta
from pathlib import Path
from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Literal, Optional,
- Union)
+ TypeAlias, Union)
import torch
import yaml
@@ -54,8 +54,7 @@ from ..models.modeling_utils import (PretrainedConfig, QuantAlgo, QuantConfig,
from ..sampling_params import BatchedLogitsProcessor
from .build_cache import BuildCacheConfig
from .tokenizer import TokenizerBase, tokenizer_factory
-from .utils import (generate_api_docs_as_docstring, get_type_repr,
- print_traceback_on_error)
+from .utils import generate_api_docs_as_docstring, get_type_repr
# TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import
@@ -599,6 +598,16 @@ class LookaheadDecodingConfig(DecodingBaseConfig, PybindMirror):
decoding_type: ClassVar[str] = "Lookahead"
+SpeculativeConfig: TypeAlias = Optional[Union[
+ DraftTargetDecodingConfig,
+ EagleDecodingConfig,
+ LookaheadDecodingConfig,
+ MedusaDecodingConfig,
+ MTPDecodingConfig,
+ NGramDecodingConfig,
+]]
+
+
@PybindMirror.mirror_pybind_fields(_KvCacheConfig)
class KvCacheConfig(BaseModel, PybindMirror):
"""
@@ -658,6 +667,8 @@ class KvCacheConfig(BaseModel, PybindMirror):
description=
"Whether partially matched blocks that are in use can be reused after copying them."
)
+ use_uvm: bool = Field(default=False,
+ description="Whether to use UVM for the KV cache.")
def _to_pybind(self):
return _KvCacheConfig(
@@ -672,7 +683,8 @@ class KvCacheConfig(BaseModel, PybindMirror):
secondary_offload_min_priority=self.secondary_offload_min_priority,
event_buffer_max_size=self.event_buffer_max_size,
enable_partial_reuse=self.enable_partial_reuse,
- copy_on_partial_reuse=self.copy_on_partial_reuse)
+ copy_on_partial_reuse=self.copy_on_partial_reuse,
+ use_uvm=self.use_uvm)
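KvCacheConfig gains a use_uvm flag that is forwarded to the underlying pybind KvCacheConfig. A minimal sketch of setting it through the LLM API (the model path is a placeholder; everything else is left at defaults):

    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig

    # Illustrative only: back the KV cache with unified (UVM) memory.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
              kv_cache_config=KvCacheConfig(use_uvm=True))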
@PybindMirror.mirror_pybind_fields(_ExtendedRuntimePerfKnobConfig)
@@ -879,8 +891,11 @@ class BaseLlmArgs(BaseModel):
enable_chunked_prefill: bool = Field(default=False,
description="Enable chunked prefill.")
- guided_decoding_backend: Optional[str] = Field(
- default=None, description="Guided decoding backend.")
+ guided_decoding_backend: Optional[Literal["xgrammar", "llguidance"]] = Field(
+ default=None,
+ description=
+ "Guided decoding backend. llguidance is supported in PyTorch backend only."
+ )
batched_logits_processor: Optional[object] = Field(
default=None,
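Narrowing guided_decoding_backend to a Literal means an unrecognized backend name should now be rejected when the pydantic arguments are validated, instead of being passed through; per the description, "llguidance" only takes effect on the PyTorch backend. Illustrative usage (placeholder model path):

    from tensorrt_llm import LLM

    # A misspelling such as guided_decoding_backend="xgrammer" should now fail
    # validation at construction time rather than surfacing later in the executor.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
              guided_decoding_backend="xgrammar")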
@@ -908,11 +923,8 @@ class BaseLlmArgs(BaseModel):
default=None, description="Cache transceiver config.")
# Speculative decoding parameters
- speculative_config: Optional[
- Union[LookaheadDecodingConfig, MedusaDecodingConfig,
- EagleDecodingConfig, MTPDecodingConfig, NGramDecodingConfig,
- DraftTargetDecodingConfig]] = Field(
- default=None, description="Speculative decoding config.")
+ speculative_config: SpeculativeConfig = Field(
+ default=None, description="Speculative decoding config.")
batching_type: Optional[BatchingType] = Field(default=None,
description="Batching type.")
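The new SpeculativeConfig alias replaces the long Optional[Union[...]] annotation on speculative_config. A small sketch of reusing it in caller code, assuming it is importable from tensorrt_llm.llmapi.llm_args as defined above (build_llm is a made-up helper for illustration):

    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi.llm_args import SpeculativeConfig

    def build_llm(model: str, spec: SpeculativeConfig = None) -> LLM:
        # Any of the union members (DraftTarget, Eagle, Lookahead, Medusa, MTP,
        # NGram decoding configs) can be passed straight through, or None.
        return LLM(model=model, speculative_config=spec)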
@@ -954,12 +966,6 @@ class BaseLlmArgs(BaseModel):
default=None,
description="The parser to separate reasoning content from output.")
- garbage_collection_gen0_threshold: int = Field(
- default=20000,
- description=
- "Threshold for Python garbage collection of generation 0 objects."
- "Lower values trigger more frequent garbage collection.")
-
# TODO[Superjomn]: To deprecate this config.
decoding_config: Optional[object] = Field(
default=None,
@@ -1621,7 +1627,6 @@ class TorchCompileConfig(BaseModel):
class TorchLlmArgs(BaseLlmArgs):
-
# Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs
build_config: Optional[object] = Field(
default=None,
@@ -1631,6 +1636,12 @@ class TorchLlmArgs(BaseLlmArgs):
# PyTorch backend specific configurations
+ garbage_collection_gen0_threshold: int = Field(
+ default=20000,
+ description=
+ "Threshold for Python garbage collection of generation 0 objects."
+ "Lower values trigger more frequent garbage collection.")
+
use_cuda_graph: bool = Field(
default=False,
description=
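garbage_collection_gen0_threshold moves out of BaseLlmArgs into the PyTorch-specific TorchLlmArgs block, so it now only applies on the PyTorch backend. The knob maps onto CPython's generational collector; a hypothetical tuning sketch (placeholder model path):

    import gc

    from tensorrt_llm import LLM

    print(gc.get_threshold())  # CPython default, e.g. (700, 10, 10)
    # Raising the LLM-level threshold trades a little extra memory growth for
    # fewer gen-0 collector pauses in the serving loop.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
              garbage_collection_gen0_threshold=40000)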
@@ -1901,115 +1912,6 @@ class TorchLlmArgs(BaseLlmArgs):
return batch_sizes
-class _AutoDeployLlmArgs(TorchLlmArgs):
- """LLM arguments specifically for AutoDeploy backend.
-
- This class extends TorchLlmArgs with AutoDeploy-specific configuration options.
- AutoDeploy provides automatic deployment and optimization of language models
- with various attention backends and optimization strategies.
- """
-
- model_factory: Literal[
- "AutoModelForCausalLM", "AutoModelForImageTextToText"] = Field(
- default="AutoModelForCausalLM",
- description="The model factory to use for loading the model.",
- )
-
- model_kwargs: Dict[str, Any] = Field(
- default_factory=dict,
- description=
- "Extra kwargs for the model config class to customize the model config. "
- "These arguments take precedence over default values or config values in the model config "
- "file. Arguments are resolved in order: 1) Default values in model config class, 2) Values "
- "in model config file, 3) Values in model_kwargs. Note: if a kwarg doesn't exist in the "
- "model config class, it will be ignored.",
- )
-
- mla_backend: Literal["MultiHeadLatentAttention"] = Field(
- default="MultiHeadLatentAttention",
- description="The Multi-Head Latent Attention backend to use.",
- )
-
- skip_loading_weights: bool = Field(
- default=False,
- description=
- "Whether to skip loading model weights during initialization. "
- "If True, only the model architecture is loaded.",
- )
-
- free_mem_ratio: float = Field(
- default=0.8,
- description="The fraction of available memory to allocate for cache. "
- "Must be between 0.0 and 1.0.",
- )
-
- simple_shard_only: bool = Field(
- default=False,
- description=
- "If True, force simple sharding (all_gather) in tensor parallelism. "
- "If False, auto-detect and use column+row (all_reduce) sharding when possible.",
- )
-
- # TODO: Remove this field once tokens_per_block is properly passed through
- attn_page_size: int = Field(
- default=64,
- description=
- "Page size for attention (tokens_per_block). For TritonWithFlattenedInputs "
- "backend, this should equal max_seq_len. Temporary field until tokens_per_block gets "
- "properly passed through.",
- )
-
- checkpoint_device: Optional[str] = Field(
- default=None,
- description="Device on which to load the model checkpoint. "
- "Defaults to the same device as the rest of the pipeline.",
- )
-
- @field_validator("free_mem_ratio")
- @classmethod
- def validate_free_mem_ratio(cls, v):
- """Validate that free_mem_ratio is between 0.0 and 1.0."""
- if not 0.0 <= v <= 1.0:
- raise ValueError(
- f"free_mem_ratio must be between 0.0 and 1.0, got {v}")
- return v
-
- @print_traceback_on_error
- def model_post_init(self, __context):
- # Modify default values that differ from TorchLlmArgs
- new_defaults = {
- "max_batch_size": 8,
- "max_seq_len": 512,
- "attn_backend": "FlashInfer",
- # TODO: Remove this when overlap scheduler is supported (https://github.com/NVIDIA/TensorRT-LLM/issues/4364)
- "disable_overlap_scheduler": True,
- }
- for k, v_default in new_defaults.items():
- if k not in self.__pydantic_fields_set__:
- setattr(self, k, v_default)
-
- # NOTE: Only call super() after setting the default values since default values should be
- # set first.
- super().model_post_init(__context)
-
- # Handle attn_page_size for TritonWithFlattenedInputs backend
- if self.attn_backend == "TritonWithFlattenedInputs":
- self.attn_page_size = self.max_seq_len
-
- # Add max_position_embeddings to model_kwargs
- # TODO (lucaslie): this is more HF specific than a generic model_kwargs. Ideally, we can
- # move this to the HF model factory but we don't have access to max_seq_len there right now.
- self.model_kwargs["max_position_embeddings"] = min(
- self.max_seq_len,
- self.model_kwargs.get("max_position_embeddings", self.max_seq_len),
- )
-
- # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
- def get_pytorch_backend_config(self) -> "_AutoDeployLlmArgs":
- """Return the _AutoDeployLlmArgs (self) object."""
- return self
-
-
def update_llm_args_with_extra_dict(
llm_args: Dict,
llm_args_dict: Dict,
diff --git a/latest/_modules/index.html b/latest/_modules/index.html
index 847207b757..b18fc681fb 100644
--- a/latest/_modules/index.html
+++ b/latest/_modules/index.html
diff --git a/latest/_modules/tensorrt_llm/builder.html b/latest/_modules/tensorrt_llm/builder.html
index 1350157efb..49a5474fef 100644
--- a/latest/_modules/tensorrt_llm/builder.html
+++ b/latest/_modules/tensorrt_llm/builder.html
diff --git a/latest/_modules/tensorrt_llm/disaggregated_params.html b/latest/_modules/tensorrt_llm/disaggregated_params.html
index 07a1284609..b1a216e2a9 100644
--- a/latest/_modules/tensorrt_llm/disaggregated_params.html
+++ b/latest/_modules/tensorrt_llm/disaggregated_params.html
diff --git a/latest/_modules/tensorrt_llm/executor/result.html b/latest/_modules/tensorrt_llm/executor/result.html
index 7c1aafb70a..7ad298fd7b 100644
--- a/latest/_modules/tensorrt_llm/executor/result.html
+++ b/latest/_modules/tensorrt_llm/executor/result.html
@@ -596,6 +596,7 @@
stop_reason (int, str, optional): The stop string or token id that caused the completion to stop, None if the completion finished for some other reason. Defaults to None.
generation_logits (torch.Tensor, optional): The logits on the generated output token ids. Defaults to None.
disaggregated_params (tensorrt_llm.disaggregated_params.DisaggregatedParams, optional): Parameters needed for disaggregated serving. Includes the type of request, the first generated tokens, the context request id, and any additional state that needs to be transferred from context and generation instances. Defaults to None.
+request_perf_metrics (tensorrt_llm.bindings.executor.RequestPerfMetrics, optional): Performance metrics for the request. Defaults to None.
Attributes:
length (int): The number of generated tokens.
@@ -614,6 +615,7 @@
stop_reason: Optional[Union[int, str]] = None
generation_logits: Optional[torch.Tensor] = None
disaggregated_params: Optional[DisaggregatedParams] = None
+request_perf_metrics: Optional[tllm.RequestPerfMetrics] = None
# hidden fields for tracking the diffs
_last_text_len: int = field(default=0, init=False, repr=False)
@@ -750,6 +752,9 @@
src_idx] == tllm.FinishReason.CANCELLED:
output.finish_reason = 'cancelled'
+if response_tensors.request_perf_metrics is not None:
+    output.request_perf_metrics = response_tensors.request_perf_metrics
+
if self._done:
if finish_reasons[src_idx] == tllm.FinishReason.END_ID:
output.finish_reason = 'stop'
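CompletionOutput now exposes the executor's per-request performance metrics when they are attached to the response. A sketch of reading them through the LLM API (assumes metrics collection is enabled for the request; how to enable it is outside this diff, and the model path is a placeholder):

    from tensorrt_llm import LLM, SamplingParams

    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    for req_output in llm.generate(["Hello, my name is"],
                                   SamplingParams(max_tokens=16)):
        for completion in req_output.outputs:
            # None unless RequestPerfMetrics was attached by the executor.
            if completion.request_perf_metrics is not None:
                print(completion.request_perf_metrics)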
diff --git a/latest/_modules/tensorrt_llm/executor/utils.html b/latest/_modules/tensorrt_llm/executor/utils.html
index e1e59e12ba..1cc0eb4ab6 100644
--- a/latest/_modules/tensorrt_llm/executor/utils.html
+++ b/latest/_modules/tensorrt_llm/executor/utils.html
diff --git a/latest/_modules/tensorrt_llm/functional.html b/latest/_modules/tensorrt_llm/functional.html
index fb56bca223..78ac19abfb 100644
--- a/latest/_modules/tensorrt_llm/functional.html
+++ b/latest/_modules/tensorrt_llm/functional.html
diff --git a/latest/_modules/tensorrt_llm/layers/activation.html b/latest/_modules/tensorrt_llm/layers/activation.html
index ebd02d03ee..33e60ea04a 100644
--- a/latest/_modules/tensorrt_llm/layers/activation.html
+++ b/latest/_modules/tensorrt_llm/layers/activation.html
diff --git a/latest/_modules/tensorrt_llm/layers/attention.html b/latest/_modules/tensorrt_llm/layers/attention.html
index fb0abd52a3..bb69614c0a 100644
--- a/latest/_modules/tensorrt_llm/layers/attention.html
+++ b/latest/_modules/tensorrt_llm/layers/attention.html
diff --git a/latest/_modules/tensorrt_llm/layers/cast.html b/latest/_modules/tensorrt_llm/layers/cast.html
index 1adfc7505c..372fbf2e7a 100644
--- a/latest/_modules/tensorrt_llm/layers/cast.html
+++ b/latest/_modules/tensorrt_llm/layers/cast.html
diff --git a/latest/_modules/tensorrt_llm/layers/conv.html b/latest/_modules/tensorrt_llm/layers/conv.html
index 10438af5be..7eb8cd46a4 100644
--- a/latest/_modules/tensorrt_llm/layers/conv.html
+++ b/latest/_modules/tensorrt_llm/layers/conv.html
diff --git a/latest/_modules/tensorrt_llm/layers/embedding.html b/latest/_modules/tensorrt_llm/layers/embedding.html
index 272bb3fe8c..d3fa3c0c02 100644
--- a/latest/_modules/tensorrt_llm/layers/embedding.html
+++ b/latest/_modules/tensorrt_llm/layers/embedding.html
diff --git a/latest/_modules/tensorrt_llm/layers/linear.html b/latest/_modules/tensorrt_llm/layers/linear.html
index efb972cbb1..8f1c8d4ee2 100644
--- a/latest/_modules/tensorrt_llm/layers/linear.html
+++ b/latest/_modules/tensorrt_llm/layers/linear.html
diff --git a/latest/_modules/tensorrt_llm/layers/mlp.html b/latest/_modules/tensorrt_llm/layers/mlp.html
index 850258bf3f..d39a4197af 100644
--- a/latest/_modules/tensorrt_llm/layers/mlp.html
+++ b/latest/_modules/tensorrt_llm/layers/mlp.html
diff --git a/latest/_modules/tensorrt_llm/layers/normalization.html b/latest/_modules/tensorrt_llm/layers/normalization.html
index 2da79e99e6..ff94345242 100644
--- a/latest/_modules/tensorrt_llm/layers/normalization.html
+++ b/latest/_modules/tensorrt_llm/layers/normalization.html
diff --git a/latest/_modules/tensorrt_llm/layers/pooling.html b/latest/_modules/tensorrt_llm/layers/pooling.html
index adefd5c408..7fcd914e4b 100644
--- a/latest/_modules/tensorrt_llm/layers/pooling.html
+++ b/latest/_modules/tensorrt_llm/layers/pooling.html
diff --git a/latest/_modules/tensorrt_llm/llmapi/build_cache.html b/latest/_modules/tensorrt_llm/llmapi/build_cache.html
index f226b0dedd..3b4e46cf1f 100644
--- a/latest/_modules/tensorrt_llm/llmapi/build_cache.html
+++ b/latest/_modules/tensorrt_llm/llmapi/build_cache.html
diff --git a/latest/_modules/tensorrt_llm/llmapi/llm.html b/latest/_modules/tensorrt_llm/llmapi/llm.html
index f186c6bbd4..9e6a9a6b12 100644
--- a/latest/_modules/tensorrt_llm/llmapi/llm.html
+++ b/latest/_modules/tensorrt_llm/llmapi/llm.html
@@ -522,11 +522,9 @@
from tqdm import tqdm
from transformers import PreTrainedTokenizerBase
-
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.inputs.data import TextPrompt
from tensorrt_llm.inputs.registry import DefaultInputProcessor
-
from .. import bindings as tllm
from .._utils import nvtx_range_debug
from ..bindings import executor as tllm
from ..builder import EngineConfig
@@ -543,7 +541,7 @@
from ..sampling_params import SamplingParams
from .llm_args import (TORCH_LLMARGS_EXPLICIT_DOCSTRING,
TRT_LLMARGS_EXPLICIT_DOCSTRING, PybindMirror,
-TorchLlmArgs, TrtLlmArgs, _AutoDeployLlmArgs)
+TorchLlmArgs, TrtLlmArgs)
from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig,
LlmBuildStats, ModelLoader, _ModelRuntimeContext)
from .mpi_session import MpiPoolSession, external_mpi_comm_available
@@ -643,7 +641,9 @@
if backend == 'pytorch':
llm_args_cls = TorchLlmArgs
elif backend == '_autodeploy':
-llm_args_cls = _AutoDeployLlmArgs
+from .._torch.auto_deploy.llm_args import \
+    LlmArgs as AutoDeployLlmArgs
+llm_args_cls = AutoDeployLlmArgs
else:
llm_args_cls = TrtLlmArgs
@@ -1047,8 +1047,6 @@
f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
)
return
-elif self.args.backend == "_autodeploy":
-    return
build_config = self.args.build_config
@@ -1108,134 +1106,6 @@
llm_build_stats=weakref.proxy(
self.llm_build_stats))
self._engine_dir, self._hf_model_dir = model_loader()
-
# update the model_dir to a local dir for the runtime, such as tokenizer loading.
-
if self._engine_dir is not None:
-
self.args.model = self._engine_dir
-
-
# Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub.
-
# It should also be before bindings ExecutorConfig, which may depend on tokenizer info.
-
self._tokenizer = self._try_load_tokenizer()
-
-
# Multimodal special handling:
-
# 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor
-
# 2. May need to modify model weights for MM (e.g., resize vocab embedding). We must do such operation via input processor's __init__
-
self.input_processor = create_input_processor(self._hf_model_dir,
-
self.tokenizer)
-
self.tokenizer = self.input_processor.tokenizer
-
-
max_batch_size = self.args.max_batch_size
-
max_num_tokens = self.args.max_num_tokens
-
max_seq_len = self.args.max_seq_len
-
-
build_config = self.args.build_config if self._on_trt_backend else BuildConfig(
-
)
-
-
max_batch_size = max_batch_size or build_config.max_batch_size
-
max_num_tokens = max_num_tokens or build_config.max_num_tokens
-
max_seq_len = max_seq_len or build_config.max_seq_len
-
-
self._executor_config = tllm.ExecutorConfig(
-
max_beam_width=self.args.max_beam_width,
-
scheduler_config=PybindMirror.maybe_to_pybind(
-
self.args.scheduler_config),
-
batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type)
-
or tllm.BatchingType.INFLIGHT,
-
max_batch_size=max_batch_size,
-
max_num_tokens=max_num_tokens,
-
gather_generation_logits=self.args.gather_generation_logits)
-
if self.args.backend is None:
-
# also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens
-
if max_seq_len is not None:
-
self._executor_config.max_seq_len = max_seq_len
-
else:
-
engine_config = EngineConfig.from_json_file(self._engine_dir /
-
"config.json")
-
self._executor_config.max_seq_len = engine_config.build_config.max_seq_len
-
if self.args.kv_cache_config is not None:
-
self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
-
self.args.kv_cache_config)
-
if os.getenv("FORCE_DETERMINISTIC", "0") == "1":
-
# Disable KV cache reuse for deterministic mode
-
self._executor_config.kv_cache_config.enable_block_reuse = False
-
self._executor_config.kv_cache_config.enable_partial_reuse = False
-
if self.args.peft_cache_config is not None:
-
self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind(
-
self.args.peft_cache_config)
-
elif self._on_trt_backend and self.args.build_config.plugin_config.lora_plugin:
-
engine_config = EngineConfig.from_json_file(self._engine_dir /
-
"config.json")
-
lora_config = engine_config.build_config.lora_config
-
max_lora_rank = lora_config.max_lora_rank
-
num_lora_modules = engine_config.pretrained_config.num_hidden_layers * \
-
len(lora_config.lora_target_modules + lora_config.missing_qkv_modules)
-
self._executor_config.peft_cache_config = tllm.PeftCacheConfig(
-
num_device_module_layer=max_lora_rank * num_lora_modules *
-
self.args.max_loras,
-
num_host_module_layer=max_lora_rank * num_lora_modules *
-
self.args.max_cpu_loras,
-
)
-
if self.args.decoding_config is not None:
-
self._executor_config.decoding_config = self.args.decoding_config
-
if self.args.guided_decoding_backend == 'xgrammar':
-
self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig(
-
backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend.
-
XGRAMMAR,
-
**_xgrammar_tokenizer_info(self.tokenizer))
-
elif self.args.guided_decoding_backend == 'llguidance':
-
self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig(
-
backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend.
-
LLGUIDANCE,
-
**_llguidance_tokenizer_info(self.tokenizer))
-
elif self.args.guided_decoding_backend is not None:
-
raise ValueError(
-
f"Unrecognized guided decoding backend {self.args.guided_decoding_backend}"
-
)
-
-
self._executor_config.normalize_log_probs = self.args.normalize_log_probs
-
self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill
-
self._executor_config.max_beam_width = self.args.max_beam_width or self.args.build_config.max_beam_width
-
if self._on_trt_backend and self.args.extended_runtime_perf_knob_config is not None:
-
self._executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind(
-
self.args.extended_runtime_perf_knob_config)
-
if self.args.cache_transceiver_config is not None:
-
self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind(
-
self.args.cache_transceiver_config)
-
from tensorrt_llm._torch.pyexecutor.config import update_executor_config
-
update_executor_config(
-
self._executor_config,
-
backend=self.args.backend,
-
pytorch_backend_config=self.args.get_pytorch_backend_config()
-
if self.args.backend in ["pytorch", "_autodeploy"] else None,
-
mapping=self.args.parallel_config.to_mapping(),
-
build_config=self.args.build_config
-
if self._on_trt_backend else None,
-
speculative_config=self.args.speculative_config,
-
hf_model_dir=self._hf_model_dir,
-
trt_engine_dir=self._engine_dir,
-
max_input_len=self.args.max_input_len,
-
max_seq_len=max_seq_len)
-
self._executor_config.llm_parallel_config = self.args.parallel_config
-
return_logits = (self.args.gather_generation_logits
-
or (self.args.build_config
-
and self.args.build_config.gather_context_logits))
-
-
self._executor = self._executor_cls.create(
-
self._engine_dir,
-
executor_config=self._executor_config,
-
batched_logits_processor=self.args.batched_logits_processor,
-
model_world_size=self.args.parallel_config.world_size,
-
mpi_session=self.mpi_session,
-
reuse_mpi_comm=external_mpi_comm_available(
-
self.args.parallel_config.world_size),
-
return_logits=return_logits,
-
postproc_worker_config=PostprocWorkerConfig(
-
num_postprocess_workers=self.args.num_postprocess_workers,
-
postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir,
-
),
-
is_llm_executor=True,
-
lora_config=self.args.lora_config,
-
garbage_collection_gen0_threshold=self.args.
-
garbage_collection_gen0_threshold)
@property
def _on_trt_backend(self) -> bool:
@@ -1376,6 +1246,116 @@
f"Copying {file} to {target_engine_dir / file.name}\n")
shutil.copy(file, target_engine_dir / file.name)
+
def _build_model(self):
+
super()._build_model()
+
# update the model_dir to a local dir for the runtime, such as tokenizer loading.
+
if self._engine_dir is not None:
+
self.args.model = self._engine_dir
+
+
# Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub.
+
# It should also be before bindings ExecutorConfig, which may depend on tokenizer info.
+
self._tokenizer = self._try_load_tokenizer()
+
+
# Multimodal special handling:
+
# 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor
+
# 2. May need to modify model weights for MM (e.g., resize vocab embedding). We must do such operation via input processor's __init__
+
self.input_processor = create_input_processor(self._hf_model_dir,
+
self.tokenizer)
+
self.tokenizer = self.input_processor.tokenizer
+
+
max_batch_size = self.args.max_batch_size
+
max_num_tokens = self.args.max_num_tokens
+
max_seq_len = self.args.max_seq_len
+
+
build_config = self.args.build_config
+
+
max_batch_size = max_batch_size or build_config.max_batch_size
+
max_num_tokens = max_num_tokens or build_config.max_num_tokens
+
max_seq_len = max_seq_len or build_config.max_seq_len
+
+
self._executor_config = tllm.ExecutorConfig(
+
max_beam_width=self.args.max_beam_width,
+
scheduler_config=PybindMirror.maybe_to_pybind(
+
self.args.scheduler_config),
+
batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type)
+
or tllm.BatchingType.INFLIGHT,
+
max_batch_size=max_batch_size,
+
max_num_tokens=max_num_tokens,
+
gather_generation_logits=self.args.gather_generation_logits)
+
+
# also set executor_config.max_seq_len in TRT workflow, to deduce default max_tokens
+
if max_seq_len is not None:
+
self._executor_config.max_seq_len = max_seq_len
+
else:
+
engine_config = EngineConfig.from_json_file(self._engine_dir /
+
"config.json")
+
self._executor_config.max_seq_len = engine_config.build_config.max_seq_len
+
+
if self.args.kv_cache_config is not None:
+
self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
+
self.args.kv_cache_config)
+
if os.getenv("FORCE_DETERMINISTIC", "0") == "1":
+
# Disable KV cache reuse for deterministic mode
+
self._executor_config.kv_cache_config.enable_block_reuse = False
+
self._executor_config.kv_cache_config.enable_partial_reuse = False
+
if self.args.peft_cache_config is not None:
+
self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind(
+
self.args.peft_cache_config)
+
elif self.args.build_config.plugin_config.lora_plugin:
+
engine_config = EngineConfig.from_json_file(self._engine_dir /
+
"config.json")
+
lora_config = engine_config.build_config.lora_config
+
max_lora_rank = lora_config.max_lora_rank
+
num_lora_modules = engine_config.pretrained_config.num_hidden_layers * \
+
len(lora_config.lora_target_modules + lora_config.missing_qkv_modules)
+
self._executor_config.peft_cache_config = tllm.PeftCacheConfig(
+
num_device_module_layer=max_lora_rank * num_lora_modules *
+
self.args.max_loras,
+
num_host_module_layer=max_lora_rank * num_lora_modules *
+
self.args.max_cpu_loras,
+
)
+
if self.args.decoding_config is not None:
+
self._executor_config.decoding_config = self.args.decoding_config
+
if self.args.guided_decoding_backend == 'xgrammar':
+
self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig(
+
backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend.
+
XGRAMMAR,
+
**_xgrammar_tokenizer_info(self.tokenizer))
+
elif self.args.guided_decoding_backend is not None:
+
raise ValueError(
+
f"Unsupported guided decoding backend {self.args.guided_decoding_backend}"
+
)
+
+
self._executor_config.normalize_log_probs = self.args.normalize_log_probs
+
self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill
+
self._executor_config.max_beam_width = self.args.max_beam_width or self.args.build_config.max_beam_width
+
if self.args.extended_runtime_perf_knob_config is not None:
+
self._executor_config.extended_runtime_perf_knob_config = PybindMirror.maybe_to_pybind(
+
self.args.extended_runtime_perf_knob_config)
+
if self.args.cache_transceiver_config is not None:
+
self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind(
+
self.args.cache_transceiver_config)
+
self._executor_config.llm_parallel_config = self.args.parallel_config
+
return_logits = (self.args.gather_generation_logits
+
or (self.args.build_config
+
and self.args.build_config.gather_context_logits))
+
+
self._executor = self._executor_cls.create(
+
self._engine_dir,
+
executor_config=self._executor_config,
+
batched_logits_processor=self.args.batched_logits_processor,
+
model_world_size=self.args.parallel_config.world_size,
+
mpi_session=self.mpi_session,
+
reuse_mpi_comm=external_mpi_comm_available(
+
self.args.parallel_config.world_size),
+
return_logits=return_logits,
+
postproc_worker_config=PostprocWorkerConfig(
+
num_postprocess_workers=self.args.num_postprocess_workers,
+
postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir,
+
),
+
is_llm_executor=True,
+
lora_config=self.args.lora_config)
+
@append_docstring(TORCH_LLM_DOCSTRING)
class _TorchLLM(BaseLLM):
@@ -1398,7 +1378,7 @@
**kwargs: Any) -> None:
# TODO: deprecate backend in LLM kwargs
-kwargs.pop("backend", None)
+backend = kwargs.pop("backend", "pytorch")
# Validate that users don't pass TrtLlmArgs-specific arguments
self._validate_args_for_torch_backend(kwargs)
@@ -1412,9 +1392,104 @@
dtype,
revision,
tokenizer_revision,
-backend='pytorch',
+backend=backend,
**kwargs)
+
def _build_model(self):
+
super()._build_model()
+
assert self._engine_dir is None
+
+
# Tokenizer loading should be after calling model_loader(), since model_loader() may download the model from HF hub.
+
# It should also be before bindings ExecutorConfig, which may depend on tokenizer info.
+
self._tokenizer = self._try_load_tokenizer()
+
+
# Multimodal special handling:
+
# 1. Default load_tokenizer may fail because MM has different tokenizer configuration. Hence we initialize it inside input processor
+
# 2. May need to modify model weights for MM (e.g., resize vocab embedding). We must do such operation via input processor's __init__
+
self.input_processor = create_input_processor(self._hf_model_dir,
+
self.tokenizer)
+
self.tokenizer = self.input_processor.tokenizer
+
+
max_batch_size = self.args.max_batch_size
+
max_num_tokens = self.args.max_num_tokens
+
max_seq_len = self.args.max_seq_len
+
+
self._executor_config = tllm.ExecutorConfig(
+
max_beam_width=self.args.max_beam_width,
+
scheduler_config=PybindMirror.maybe_to_pybind(
+
self.args.scheduler_config),
+
batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type)
+
or tllm.BatchingType.INFLIGHT,
+
max_batch_size=max_batch_size,
+
max_num_tokens=max_num_tokens,
+
gather_generation_logits=self.args.gather_generation_logits)
+
+
if self.args.kv_cache_config is not None:
+
self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
+
self.args.kv_cache_config)
+
if os.getenv("FORCE_DETERMINISTIC", "0") == "1":
+
# Disable KV cache reuse for deterministic mode
+
self._executor_config.kv_cache_config.enable_block_reuse = False
+
self._executor_config.kv_cache_config.enable_partial_reuse = False
+
if self.args.peft_cache_config is not None:
+
self._executor_config.peft_cache_config = PybindMirror.maybe_to_pybind(
+
self.args.peft_cache_config)
+
if self.args.decoding_config is not None:
+
self._executor_config.decoding_config = self.args.decoding_config
+
if self.args.guided_decoding_backend == 'xgrammar':
+
self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig(
+
backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend.
+
XGRAMMAR,
+
**_xgrammar_tokenizer_info(self.tokenizer))
+
elif self.args.guided_decoding_backend == 'llguidance':
+
self._executor_config.guided_decoding_config = tllm.GuidedDecodingConfig(
+
backend=tllm.GuidedDecodingConfig.GuidedDecodingBackend.
+
LLGUIDANCE,
+
**_llguidance_tokenizer_info(self.tokenizer))
+
elif self.args.guided_decoding_backend is not None:
+
raise ValueError(
+
f"Unsupported guided decoding backend {self.args.guided_decoding_backend}"
+
)
+
+
self._executor_config.normalize_log_probs = self.args.normalize_log_probs
+
self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill
+
self._executor_config.max_beam_width = self.args.max_beam_width
+
if self.args.cache_transceiver_config is not None:
+
self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind(
+
self.args.cache_transceiver_config)
+
from tensorrt_llm._torch.pyexecutor.config import update_executor_config
+
update_executor_config(
+
self._executor_config,
+
backend=self.args.backend,
+
pytorch_backend_config=self.args.get_pytorch_backend_config()
+
if self.args.backend in ["pytorch", "_autodeploy"] else None,
+
mapping=self.args.parallel_config.to_mapping(),
+
speculative_config=self.args.speculative_config,
+
hf_model_dir=self._hf_model_dir,
+
max_input_len=self.args.max_input_len,
+
max_seq_len=max_seq_len)
+
+
# TODO: revisit gather_context_logits
+
return_logits = self.args.gather_generation_logits
+
+
self._executor = self._executor_cls.create(
+
self._engine_dir,
+
executor_config=self._executor_config,
+
batched_logits_processor=self.args.batched_logits_processor,
+
model_world_size=self.args.parallel_config.world_size,
+
mpi_session=self.mpi_session,
+
reuse_mpi_comm=external_mpi_comm_available(
+
self.args.parallel_config.world_size),
+
return_logits=return_logits,
+
postproc_worker_config=PostprocWorkerConfig(
+
num_postprocess_workers=self.args.num_postprocess_workers,
+
postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir,
+
),
+
is_llm_executor=True,
+
lora_config=self.args.lora_config,
+
garbage_collection_gen0_threshold=self.args.
+
garbage_collection_gen0_threshold)
+
def _validate_args_for_torch_backend(self, kwargs: dict) -> None:
"""Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend.
"""
diff --git a/latest/_modules/tensorrt_llm/llmapi/llm_args.html b/latest/_modules/tensorrt_llm/llmapi/llm_args.html
index 005d9e8e3e..e4366771a8 100644
--- a/latest/_modules/tensorrt_llm/llmapi/llm_args.html
+++ b/latest/_modules/tensorrt_llm/llmapi/llm_args.html
@@ -517,7 +517,7 @@
from enum import Enum, EnumMeta
from pathlib import Path
from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Literal, Optional,
-Union)
+TypeAlias, Union)
import torch
import yaml
@@ -564,8 +564,7 @@
from ..sampling_params import BatchedLogitsProcessor
from .build_cache import BuildCacheConfig
from .tokenizer import TokenizerBase, tokenizer_factory
-from .utils import (generate_api_docs_as_docstring, get_type_repr,
-    print_traceback_on_error)
+from .utils import generate_api_docs_as_docstring, get_type_repr
# TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import
@@ -1178,6 +1177,16 @@
+SpeculativeConfig: TypeAlias = Optional[Union[
+    DraftTargetDecodingConfig,
+    EagleDecodingConfig,
+    LookaheadDecodingConfig,
+    MedusaDecodingConfig,
+    MTPDecodingConfig,
+    NGramDecodingConfig,
+]]
+
+
[docs]
@PybindMirror.mirror_pybind_fields(_KvCacheConfig)
@@ -1239,6 +1248,8 @@
description=
"Whether partially matched blocks that are in use can be reused after copying them."
)
+use_uvm: bool = Field(default=False,
+    description="Whether to use UVM for the KV cache.")
def _to_pybind(self):
return _KvCacheConfig(
@@ -1253,7 +1264,8 @@
secondary_offload_min_priority=self.secondary_offload_min_priority,
event_buffer_max_size=self.event_buffer_max_size,
enable_partial_reuse=self.enable_partial_reuse,
-            copy_on_partial_reuse=self.copy_on_partial_reuse)
+            copy_on_partial_reuse=self.copy_on_partial_reuse,
+            use_uvm=self.use_uvm)
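The new use_uvm flag flows into the pybind _KvCacheConfig via _to_pybind above. A hedged usage sketch, assuming KvCacheConfig and LLM are importable from tensorrt_llm.llmapi and using a placeholder model path:

from tensorrt_llm.llmapi import LLM, KvCacheConfig

# Back the KV cache with UVM and keep block reuse enabled.
kv_cache_config = KvCacheConfig(enable_block_reuse=True, use_uvm=True)

llm = LLM(model="/path/to/model",          # placeholder path
          kv_cache_config=kv_cache_config)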
@@ -1467,8 +1479,11 @@
[docs]
class TorchLlmArgs(BaseLlmArgs):
-    # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs
build_config: Optional[object] = Field(
default=None,
@@ -2239,6 +2244,12 @@
# PyTorch backend specific configurations
+    garbage_collection_gen0_threshold: int = Field(
+        default=20000,
+        description=
+        "Threshold for Python garbage collection of generation 0 objects. "
+        "Lower values trigger more frequent garbage collection.")
+
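The threshold above maps onto CPython's generation-0 collection threshold. A stdlib-only illustration of what the number controls (the backend's actual wiring may differ):

import gc

print(gc.get_threshold())          # default is typically (700, 10, 10)

gen0_threshold = 20000             # the field's default value
_, gen1, gen2 = gc.get_threshold()
gc.set_threshold(gen0_threshold, gen1, gen2)

print(gc.get_threshold())          # generation-0 collections now trigger far less often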
use_cuda_graph: bool = Field(
default=False,
description=
@@ -2531,115 +2542,6 @@
-class _AutoDeployLlmArgs(TorchLlmArgs):
-    """LLM arguments specifically for AutoDeploy backend.
-
-    This class extends TorchLlmArgs with AutoDeploy-specific configuration options.
-    AutoDeploy provides automatic deployment and optimization of language models
-    with various attention backends and optimization strategies.
-    """
-
-    model_factory: Literal[
-        "AutoModelForCausalLM", "AutoModelForImageTextToText"] = Field(
-            default="AutoModelForCausalLM",
-            description="The model factory to use for loading the model.",
-        )
-
-    model_kwargs: Dict[str, Any] = Field(
-        default_factory=dict,
-        description=
-        "Extra kwargs for the model config class to customize the model config. "
-        "These arguments take precedence over default values or config values in the model config "
-        "file. Arguments are resolved in order: 1) Default values in model config class, 2) Values "
-        "in model config file, 3) Values in model_kwargs. Note: if a kwarg doesn't exist in the "
-        "model config class, it will be ignored.",
-    )
-
-    mla_backend: Literal["MultiHeadLatentAttention"] = Field(
-        default="MultiHeadLatentAttention",
-        description="The Multi-Head Latent Attention backend to use.",
-    )
-
-    skip_loading_weights: bool = Field(
-        default=False,
-        description=
-        "Whether to skip loading model weights during initialization. "
-        "If True, only the model architecture is loaded.",
-    )
-
-    free_mem_ratio: float = Field(
-        default=0.8,
-        description="The fraction of available memory to allocate for cache. "
-        "Must be between 0.0 and 1.0.",
-    )
-
-    simple_shard_only: bool = Field(
-        default=False,
-        description=
-        "If True, force simple sharding (all_gather) in tensor parallelism. "
-        "If False, auto-detect and use column+row (all_reduce) sharding when possible.",
-    )
-
-    # TODO: Remove this field once tokens_per_block is properly passed through
-    attn_page_size: int = Field(
-        default=64,
-        description=
-        "Page size for attention (tokens_per_block). For TritonWithFlattenedInputs "
-        "backend, this should equal max_seq_len. Temporary field until tokens_per_block gets "
-        "properly passed through.",
-    )
-
-    checkpoint_device: Optional[str] = Field(
-        default=None,
-        description="Device on which to load the model checkpoint. "
-        "Defaults to the same device as the rest of the pipeline.",
-    )
-
-    @field_validator("free_mem_ratio")
-    @classmethod
-    def validate_free_mem_ratio(cls, v):
-        """Validate that free_mem_ratio is between 0.0 and 1.0."""
-        if not 0.0 <= v <= 1.0:
-            raise ValueError(
-                f"free_mem_ratio must be between 0.0 and 1.0, got {v}")
-        return v
-
-    @print_traceback_on_error
-    def model_post_init(self, __context):
-        # Modify default values that differ from TorchLlmArgs
-        new_defaults = {
-            "max_batch_size": 8,
-            "max_seq_len": 512,
-            "attn_backend": "FlashInfer",
-            # TODO: Remove this when overlap scheduler is supported (https://github.com/NVIDIA/TensorRT-LLM/issues/4364)
-            "disable_overlap_scheduler": True,
-        }
-        for k, v_default in new_defaults.items():
-            if k not in self.__pydantic_fields_set__:
-                setattr(self, k, v_default)
-
-        # NOTE: Only call super() after setting the default values since default values should be
-        # set first.
-        super().model_post_init(__context)
-
-        # Handle attn_page_size for TritonWithFlattenedInputs backend
-        if self.attn_backend == "TritonWithFlattenedInputs":
-            self.attn_page_size = self.max_seq_len
-
-        # Add max_position_embeddings to model_kwargs
-        # TODO (lucaslie): this is more HF specific than a generic model_kwargs. Ideally, we can
-        # move this to the HF model factory but we don't have access to max_seq_len there right now.
-        self.model_kwargs["max_position_embeddings"] = min(
-            self.max_seq_len,
-            self.model_kwargs.get("max_position_embeddings", self.max_seq_len),
-        )
-
-    # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
-    def get_pytorch_backend_config(self) -> "_AutoDeployLlmArgs":
-        """Return the _AutoDeployLlmArgs (self) object."""
-        return self
-
-
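The removed model_post_init above relied on pydantic v2's __pydantic_fields_set__ to re-default only fields the caller did not set explicitly. A generic, self-contained sketch of that idiom (names below are illustrative, not TensorRT-LLM code):

from pydantic import BaseModel

class Args(BaseModel):
    max_batch_size: int = 2048
    max_seq_len: int = 4096

    def model_post_init(self, __context) -> None:
        # Fields passed explicitly by the caller appear in __pydantic_fields_set__;
        # everything else can safely be re-defaulted by a specialized subclass.
        new_defaults = {"max_batch_size": 8, "max_seq_len": 512}
        for name, value in new_defaults.items():
            if name not in self.__pydantic_fields_set__:
                setattr(self, name, value)

print(Args().max_batch_size)                   # 8: re-defaulted
print(Args(max_batch_size=64).max_batch_size)  # 64: user value preserved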
def update_llm_args_with_extra_dict(
llm_args: Dict,
llm_args_dict: Dict,
@@ -2831,9 +2733,9 @@
diff --git a/latest/_modules/tensorrt_llm/llmapi/mpi_session.html b/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
index 3fa5c2078d..c0e69fa75c 100644
--- a/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
+++ b/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1149,9 +1149,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/baichuan/model.html b/latest/_modules/tensorrt_llm/models/baichuan/model.html
index a586074d52..c32839c332 100644
--- a/latest/_modules/tensorrt_llm/models/baichuan/model.html
+++ b/latest/_modules/tensorrt_llm/models/baichuan/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -881,9 +881,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/bert/model.html b/latest/_modules/tensorrt_llm/models/bert/model.html
index 9e5bac9011..d7575d75b5 100644
--- a/latest/_modules/tensorrt_llm/models/bert/model.html
+++ b/latest/_modules/tensorrt_llm/models/bert/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1185,9 +1185,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/bloom/model.html b/latest/_modules/tensorrt_llm/models/bloom/model.html
index 70d50270c3..5ea89b2b92 100644
--- a/latest/_modules/tensorrt_llm/models/bloom/model.html
+++ b/latest/_modules/tensorrt_llm/models/bloom/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -793,9 +793,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/chatglm/config.html b/latest/_modules/tensorrt_llm/models/chatglm/config.html
index 79655aee0d..33b4eab4cc 100644
--- a/latest/_modules/tensorrt_llm/models/chatglm/config.html
+++ b/latest/_modules/tensorrt_llm/models/chatglm/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -810,9 +810,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/chatglm/model.html b/latest/_modules/tensorrt_llm/models/chatglm/model.html
index 64f38de5df..bf87147bae 100644
--- a/latest/_modules/tensorrt_llm/models/chatglm/model.html
+++ b/latest/_modules/tensorrt_llm/models/chatglm/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1009,9 +1009,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/clip/model.html b/latest/_modules/tensorrt_llm/models/clip/model.html
index 259c6bfe20..7e937a4d75 100644
--- a/latest/_modules/tensorrt_llm/models/clip/model.html
+++ b/latest/_modules/tensorrt_llm/models/clip/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -838,9 +838,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/cogvlm/config.html b/latest/_modules/tensorrt_llm/models/cogvlm/config.html
index adc6c1be4f..dc14e3cef0 100644
--- a/latest/_modules/tensorrt_llm/models/cogvlm/config.html
+++ b/latest/_modules/tensorrt_llm/models/cogvlm/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -669,9 +669,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/cogvlm/model.html b/latest/_modules/tensorrt_llm/models/cogvlm/model.html
index f61e940b27..33fbb47072 100644
--- a/latest/_modules/tensorrt_llm/models/cogvlm/model.html
+++ b/latest/_modules/tensorrt_llm/models/cogvlm/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -922,9 +922,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/commandr/model.html b/latest/_modules/tensorrt_llm/models/commandr/model.html
index 6416fd4239..1d6a1d00d1 100644
--- a/latest/_modules/tensorrt_llm/models/commandr/model.html
+++ b/latest/_modules/tensorrt_llm/models/commandr/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -820,9 +820,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/dbrx/config.html b/latest/_modules/tensorrt_llm/models/dbrx/config.html
index 1456bee197..83c8ea97bf 100644
--- a/latest/_modules/tensorrt_llm/models/dbrx/config.html
+++ b/latest/_modules/tensorrt_llm/models/dbrx/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -684,9 +684,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/dbrx/model.html b/latest/_modules/tensorrt_llm/models/dbrx/model.html
index f3e26bd173..d40fe208d7 100644
--- a/latest/_modules/tensorrt_llm/models/dbrx/model.html
+++ b/latest/_modules/tensorrt_llm/models/dbrx/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -810,9 +810,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html b/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
index 7423a59470..e155b82ea2 100644
--- a/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
+++ b/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -904,9 +904,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html b/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
index eda762b669..2ac1567401 100644
--- a/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
+++ b/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -986,9 +986,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/dit/model.html b/latest/_modules/tensorrt_llm/models/dit/model.html
index 4515b184cf..e6586cb67e 100644
--- a/latest/_modules/tensorrt_llm/models/dit/model.html
+++ b/latest/_modules/tensorrt_llm/models/dit/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1022,9 +1022,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/eagle/model.html b/latest/_modules/tensorrt_llm/models/eagle/model.html
index 566feb2384..563f3b7701 100644
--- a/latest/_modules/tensorrt_llm/models/eagle/model.html
+++ b/latest/_modules/tensorrt_llm/models/eagle/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1958,9 +1958,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/enc_dec/model.html b/latest/_modules/tensorrt_llm/models/enc_dec/model.html
index 4e8f487132..368895e61d 100644
--- a/latest/_modules/tensorrt_llm/models/enc_dec/model.html
+++ b/latest/_modules/tensorrt_llm/models/enc_dec/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -2863,9 +2863,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/falcon/config.html b/latest/_modules/tensorrt_llm/models/falcon/config.html
index 99e1bf8b31..df622f4b18 100644
--- a/latest/_modules/tensorrt_llm/models/falcon/config.html
+++ b/latest/_modules/tensorrt_llm/models/falcon/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -745,9 +745,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/falcon/model.html b/latest/_modules/tensorrt_llm/models/falcon/model.html
index 002786628a..341cb17e5f 100644
--- a/latest/_modules/tensorrt_llm/models/falcon/model.html
+++ b/latest/_modules/tensorrt_llm/models/falcon/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -907,9 +907,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gemma/config.html b/latest/_modules/tensorrt_llm/models/gemma/config.html
index 20720027d0..dbb7708cc2 100644
--- a/latest/_modules/tensorrt_llm/models/gemma/config.html
+++ b/latest/_modules/tensorrt_llm/models/gemma/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -835,9 +835,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gemma/model.html b/latest/_modules/tensorrt_llm/models/gemma/model.html
index d6733f96c6..6dcd05688c 100644
--- a/latest/_modules/tensorrt_llm/models/gemma/model.html
+++ b/latest/_modules/tensorrt_llm/models/gemma/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1027,9 +1027,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gpt/config.html b/latest/_modules/tensorrt_llm/models/gpt/config.html
index 48e90b4246..eee8794c63 100644
--- a/latest/_modules/tensorrt_llm/models/gpt/config.html
+++ b/latest/_modules/tensorrt_llm/models/gpt/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -954,9 +954,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gpt/model.html b/latest/_modules/tensorrt_llm/models/gpt/model.html
index db58e938b4..66700fe4b8 100644
--- a/latest/_modules/tensorrt_llm/models/gpt/model.html
+++ b/latest/_modules/tensorrt_llm/models/gpt/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1057,9 +1057,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gptj/config.html b/latest/_modules/tensorrt_llm/models/gptj/config.html
index 05b5453c1f..8333e21704 100644
--- a/latest/_modules/tensorrt_llm/models/gptj/config.html
+++ b/latest/_modules/tensorrt_llm/models/gptj/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -683,9 +683,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gptj/model.html b/latest/_modules/tensorrt_llm/models/gptj/model.html
index 0793c28d85..80bab7b377 100644
--- a/latest/_modules/tensorrt_llm/models/gptj/model.html
+++ b/latest/_modules/tensorrt_llm/models/gptj/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -835,9 +835,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/gptneox/model.html b/latest/_modules/tensorrt_llm/models/gptneox/model.html
index 868013cc92..1511f6beff 100644
--- a/latest/_modules/tensorrt_llm/models/gptneox/model.html
+++ b/latest/_modules/tensorrt_llm/models/gptneox/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -775,9 +775,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/llama/config.html b/latest/_modules/tensorrt_llm/models/llama/config.html
index ca8a536bc3..b5100ba012 100644
--- a/latest/_modules/tensorrt_llm/models/llama/config.html
+++ b/latest/_modules/tensorrt_llm/models/llama/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -909,9 +909,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/llama/model.html b/latest/_modules/tensorrt_llm/models/llama/model.html
index b7c9994c49..36cc9fc2e2 100644
--- a/latest/_modules/tensorrt_llm/models/llama/model.html
+++ b/latest/_modules/tensorrt_llm/models/llama/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1257,9 +1257,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/mamba/model.html b/latest/_modules/tensorrt_llm/models/mamba/model.html
index 0958c620af..c9519d2acb 100644
--- a/latest/_modules/tensorrt_llm/models/mamba/model.html
+++ b/latest/_modules/tensorrt_llm/models/mamba/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1102,9 +1102,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/medusa/config.html b/latest/_modules/tensorrt_llm/models/medusa/config.html
index bbceb7bbe0..766732eee5 100644
--- a/latest/_modules/tensorrt_llm/models/medusa/config.html
+++ b/latest/_modules/tensorrt_llm/models/medusa/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -742,9 +742,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/medusa/model.html b/latest/_modules/tensorrt_llm/models/medusa/model.html
index 6343c5fc06..28f9bebf4f 100644
--- a/latest/_modules/tensorrt_llm/models/medusa/model.html
+++ b/latest/_modules/tensorrt_llm/models/medusa/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -892,9 +892,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/mllama/model.html b/latest/_modules/tensorrt_llm/models/mllama/model.html
index 6e573c3496..5c64131007 100644
--- a/latest/_modules/tensorrt_llm/models/mllama/model.html
+++ b/latest/_modules/tensorrt_llm/models/mllama/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -2203,9 +2203,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html b/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
index 8e038f8f1b..e4ab9fe748 100644
--- a/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
+++ b/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1269,9 +1269,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/modeling_utils.html b/latest/_modules/tensorrt_llm/models/modeling_utils.html
index bf3b835ed1..3ccb1da008 100644
--- a/latest/_modules/tensorrt_llm/models/modeling_utils.html
+++ b/latest/_modules/tensorrt_llm/models/modeling_utils.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -2664,9 +2664,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/mpt/model.html b/latest/_modules/tensorrt_llm/models/mpt/model.html
index 5adbf7a9bf..eeffa7957e 100644
--- a/latest/_modules/tensorrt_llm/models/mpt/model.html
+++ b/latest/_modules/tensorrt_llm/models/mpt/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -807,9 +807,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html b/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
index 49f58e4721..91fc451176 100644
--- a/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
+++ b/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -741,9 +741,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html b/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
index af1f2cd750..1adb9ea6ab 100644
--- a/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
+++ b/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -809,9 +809,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/opt/model.html b/latest/_modules/tensorrt_llm/models/opt/model.html
index 6a79ec46de..db9822d7d4 100644
--- a/latest/_modules/tensorrt_llm/models/opt/model.html
+++ b/latest/_modules/tensorrt_llm/models/opt/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -812,9 +812,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/phi/model.html b/latest/_modules/tensorrt_llm/models/phi/model.html
index 12c0ed3fc2..6c87fa0144 100644
--- a/latest/_modules/tensorrt_llm/models/phi/model.html
+++ b/latest/_modules/tensorrt_llm/models/phi/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -856,9 +856,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/phi3/model.html b/latest/_modules/tensorrt_llm/models/phi3/model.html
index 69c6bedc94..be16c70366 100644
--- a/latest/_modules/tensorrt_llm/models/phi3/model.html
+++ b/latest/_modules/tensorrt_llm/models/phi3/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -952,9 +952,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html b/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
index 2a8daca22e..8046843cf1 100644
--- a/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
+++ b/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -1255,9 +1255,9 @@
diff --git a/latest/_modules/tensorrt_llm/models/redrafter/model.html b/latest/_modules/tensorrt_llm/models/redrafter/model.html
index b6d8f3b245..b5b15c6208 100644
--- a/latest/_modules/tensorrt_llm/models/redrafter/model.html
+++ b/latest/_modules/tensorrt_llm/models/redrafter/model.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@
-
+
@@ -530,7 +530,7 @@
from tensorrt_llm._common import default_net
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.functional import Tensor, cast, categorical_sample
-from tensorrt_llm.models import LLaMAForCausalLM
+from tensorrt_llm.models import LLaMAForCausalLM, QWenForCausalLM
from tensorrt_llm.models.generation_mixin import GenerationMixin
from ..._utils import pad_vocab_size, str_dtype_to_trt
@@ -539,9 +539,7 @@
_process_logits_and_hidden_states)
-
-[docs]
-class ReDrafterForCausalLM(LLaMAForCausalLM):
+class ReDrafterMixin:
def __init__(self, config):
@@ -624,8 +622,6 @@
return next_token, probs, draft_input
-
-[docs]
def forward(self, *args, **kwargs):
"""
0. run base model, get logits, hidden_states
@@ -661,11 +657,8 @@
self.drafter,
kwargs=kwargs)
-        return new_draft_tokens, new_draft_logits, probs
+        return new_draft_tokens, new_draft_logits, probs
-
-
-
+        return inputs
+
+
+
+[docs]
+class ReDrafterForQWenLM(ReDrafterMixin, QWenForCausalLM):
+    """ReDrafter implementation for QWen models.
+
+    Combines:
+    - Base QWen model functionality from QWenForCausalLM
+    - Drafting/speculative decoding logic from ReDrafterMixin
+    """
+
+
+
+
+[docs]
+class ReDrafterForLLaMALM(ReDrafterMixin, LLaMAForCausalLM):
+    """ReDrafter implementation for LLaMA models.
+
+    Combines:
+    - Base LLaMA model functionality from LLaMAForCausalLM
+    - Drafting/speculative decoding logic from ReDrafterMixin
+    """
@@ -927,9 +942,9 @@
diff --git a/latest/_modules/tensorrt_llm/plugin/plugin.html b/latest/_modules/tensorrt_llm/plugin/plugin.html
index 611976cad0..834603105c 100644
--- a/latest/_modules/tensorrt_llm/plugin/plugin.html
+++ b/latest/_modules/tensorrt_llm/plugin/plugin.html
@@ -50,7 +50,7 @@
@@ -60,7 +60,7 @@