@@ -14756,9 +14760,9 @@ one more than decoding draft tokens for prediction from primary head
diff --git a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
index f736ce8ce4..50710d6d4f 100644
--- a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
+++ b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
@@ -127,6 +127,8 @@ class Attention(nn.Module):
q_scaling: float = 1.0,
attention_chunk_size: Optional[int] = None,
disable_deep_gemm: bool = False,
+ attn_output_gate: Optional[bool] = None,
+ use_custom_cublas_mm: bool = False,
):
"""
Initialize the Attention module.
@@ -146,6 +148,7 @@ class Attention(nn.Module):
q_scaling (float): The scaling factor for the qk_scale. The definition is $O = softmax(QK^T * qk_scale) * V, qk_scale = 1 / (sqrt(head_dim) * q_scaling)$. The default value is 1.0.
attention_chunk_size (Optional[int]): See [Chunked Attention] below.
disable_deep_gemm (bool): Whether to disable the use of DeepGEMM in Linear layers (currently only matters on SM100 + FP8).
+            attn_output_gate (Optional[bool]): Determines whether to use an output gate in the attention Op. If None (the default), the decision is handled automatically by the attention backend based on its capabilities.
"""
super().__init__()
self.layer_idx = layer_idx
@@ -172,6 +175,10 @@ class Attention(nn.Module):
self.pos_embd_params = pos_embd_params
self.dense_bias = dense_bias
self.q_scaling = q_scaling
+ self.attn_output_gate = attn_output_gate
+
+ if self.attn_output_gate:
+ logger.info_once("using attn output gate!", key="attn_output_gate")
# [Chunked Attention]
# Chunked attention is applied to context requests only. Chunked attention will be
@@ -217,7 +224,8 @@ class Attention(nn.Module):
self.qkv_proj = Linear(
self.hidden_size,
- tp_size * self.q_size + 2 * tp_size * self.kv_size,
+ tp_size * self.q_size * (2 if self.attn_output_gate else 1) +
+ 2 * tp_size * self.kv_size,
bias=bias,
dtype=dtype,
mapping=mapping,
@@ -229,7 +237,7 @@ class Attention(nn.Module):
allreduce_strategy=config.allreduce_strategy,
force_dynamic_quantization=config.force_dynamic_quantization,
disable_deep_gemm=disable_deep_gemm,
- )
+ use_custom_cublas_mm=use_custom_cublas_mm)
self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
[self.hidden_size])
@@ -247,11 +255,13 @@ class Attention(nn.Module):
allreduce_strategy=config.allreduce_strategy,
force_dynamic_quantization=config.force_dynamic_quantization,
disable_deep_gemm=disable_deep_gemm,
- )
+ use_custom_cublas_mm=use_custom_cublas_mm)
self.quant_config = config.get_quant_config()
self.attn_backend = config.attn_backend
- attn_cls = get_attention_backend(self.attn_backend)
+ attn_cls = get_attention_backend(
+ self.attn_backend,
+ sparse_attn_config=config.sparse_attention_config)
# These two modules are mutually exclusive - either splitted_qkv_lora or fused_qkv_lora will be used,
# but never both at the same time. splitted_qkv_lora handles Q,K,V separately while fused_qkv_lora
@@ -269,6 +279,9 @@ class Attention(nn.Module):
# Whether to fuse RoPE into the attention OP.
# If true, RoPE will be applied in self.attn.forward.
# If false, RoPE will be applied in self.apply_rope.
+ if config.sparse_attention_config is not None:
+ logger.warning("disable rope_fusion for sparse attention.")
+ rope_fusion = False
self.rope_fusion = rope_fusion
if self.rope_fusion and not attn_cls.support_fused_rope():
logger.warning(
@@ -306,6 +319,7 @@ class Attention(nn.Module):
skip_create_weights_in_init=config.skip_create_weights_in_init,
q_scaling=self.q_scaling,
attention_chunk_size=self.attention_chunk_size,
+ sparse_attention_config=config.sparse_attention_config,
)
self.support_fused_qkv = self.attn.support_fused_qkv()
@@ -521,24 +535,39 @@ class Attention(nn.Module):
if qkv_lora is not None:
qkv = qkv + qkv_lora
- q, k, v = qkv, None, None
+ if self.attn_output_gate:
+ q_gate, k, v = qkv.split(
+ [self.q_size * 2, self.kv_size, self.kv_size], dim=-1)
+ orig_shape = q_gate.shape[:-1]
+            # Single statement: view -> chunk -> reshape to split the fused tensor into q and gate
+ q, gate = [
+ t.reshape(*orig_shape, -1) for t in torch.chunk(
+ q_gate.view(*orig_shape, self.num_heads, -1), 2, dim=-1)
+ ]
+ else:
+ q, k, v = qkv, None, None
+
q, k, v = self.apply_rope(q, k, v, position_ids)
q, k, v = self.convert_qkv(q, k, v)
if attention_sinks is not None:
assert self.attn_backend == "TRTLLM", "Attention sinks are only supported for TRTLLM backend."
- output = self.forward_impl(q,
- k,
- v,
- attn_metadata,
- attention_mask,
- attention_window_size,
- attention_mask_data,
- mrope_config=mrope_config,
- attention_sinks=attention_sinks)
+ attn_output = self.forward_impl(q,
+ k,
+ v,
+ attn_metadata,
+ attention_mask,
+ attention_window_size,
+ attention_mask_data,
+ mrope_config=mrope_config,
+ attention_sinks=attention_sinks)
- attn_output = self.o_proj(output,
+ if self.attn_output_gate:
+ gate = torch.sigmoid(gate)
+ attn_output = attn_output * gate
+
+ attn_output = self.o_proj(attn_output,
all_reduce_params=all_reduce_params,
lora_params=lora_params,
layer_idx=self.layer_idx)
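When attn_output_gate is enabled, the fused QKV projection above produces twice the usual Q width, and the extra half acts as a per-head gate that is applied, after a sigmoid, to the attention output. A minimal sketch of that split-and-gate step (argument names and shapes are illustrative, not the module's API):

# --- illustrative sketch, not part of the diff ---
import torch

def split_q_and_gate(qkv: torch.Tensor, num_heads: int, head_dim: int,
                     kv_size: int):
    """Split fused [Q | gate | K | V] activations when the output gate is on."""
    q_size = num_heads * head_dim
    q_gate, k, v = qkv.split([2 * q_size, kv_size, kv_size], dim=-1)
    prefix = q_gate.shape[:-1]
    # Per head, the doubled width holds [q | gate]; chunk along the last dim.
    q, gate = (t.reshape(*prefix, -1) for t in torch.chunk(
        q_gate.view(*prefix, num_heads, 2 * head_dim), 2, dim=-1))
    return q, gate, k, v

# After the attention op, the gate is applied element-wise:
#   attn_output = attn_output * torch.sigmoid(gate)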
@@ -831,6 +860,7 @@ class MLA(nn.Module):
v_head_dim=self.v_head_dim,
predicted_tokens_per_seq=self.predicted_tokens_per_seq,
skip_create_weights_in_init=config.skip_create_weights_in_init,
+ sparse_attention_config=config.sparse_attention_config,
)
self.mqa = create_attention(
@@ -850,6 +880,7 @@ class MLA(nn.Module):
v_head_dim=self.kv_lora_rank,
predicted_tokens_per_seq=self.predicted_tokens_per_seq,
skip_create_weights_in_init=config.skip_create_weights_in_init,
+ sparse_attention_config=config.sparse_attention_config,
)
self.aux_stream = aux_stream
diff --git a/latest/_downloads/c68095123d889975e6e5e839a4241d22/model_engine.py b/latest/_downloads/c68095123d889975e6e5e839a4241d22/model_engine.py
index a0f650f6af..f1d6146256 100644
--- a/latest/_downloads/c68095123d889975e6e5e839a4241d22/model_engine.py
+++ b/latest/_downloads/c68095123d889975e6e5e839a4241d22/model_engine.py
@@ -18,6 +18,8 @@ from tensorrt_llm._utils import (is_trace_enabled, nvtx_range, release_gc,
torch_dtype_to_str, trace_func)
from tensorrt_llm.inputs.multimodal import (MultimodalParams,
MultimodalRuntimeData)
+from tensorrt_llm.inputs.registry import (create_input_processor,
+ create_input_processor_with_hash)
from tensorrt_llm.logger import logger
from tensorrt_llm.lora_helper import LoraConfig
from tensorrt_llm.lora_manager import LoraModelConfig
@@ -34,6 +36,7 @@ from ..compilation.utils import capture_piecewise_cuda_graph
from ..distributed import MPIDist
from ..distributed.communicator import init_pp_comm
from ..expert_statistic import ExpertStatistic
+from ..memory_buffer_utils import with_shared_pool
from ..metadata import KVCacheParams
from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader
from ..models.modeling_multimodal_utils import filter_mm_token_from_input_ids
@@ -61,8 +64,6 @@ from .resource_manager import (BaseResourceManager, KVCacheManager,
from .sampler import SampleStateTensors
from .scheduler import ScheduledRequests
-MAX_UINT64 = (1 << 64) - 1
-
class ModelEngine(ABC):
@@ -139,12 +140,14 @@ class PyTorchModelEngine(ModelEngine):
attn_runtime_features: Optional[AttentionRuntimeFeatures] = None,
dist: Optional[MPIDist] = None,
spec_config: Optional["DecodingBaseConfig"] = None,
+ sparse_attention_config: Optional["SparseAttentionConfig"] = None,
lora_config: Optional[LoraConfig] = None,
is_draft_model: bool = False,
drafting_loop_wrapper: Optional[Callable[[torch.nn.Module],
torch.nn.Module]] = None,
model: Optional[torch.nn.Module] = None,
):
+ self.forward_pass_callable = None
self.ub_buffers = None
self.batch_size = batch_size
self.max_num_tokens = max_num_tokens
@@ -166,17 +169,21 @@ class PyTorchModelEngine(ModelEngine):
spec_config.max_draft_len = 0
self.spec_config = spec_config
self.is_spec_decode = spec_config is not None
+ self.sparse_attention_config = sparse_attention_config
self.enable_spec_decode = self.is_spec_decode
self.is_draft_model = is_draft_model
self.attn_runtime_features = attn_runtime_features or AttentionRuntimeFeatures(
)
-
+ self.input_processor = create_input_processor(model_path, None)
+ self.input_processor_with_hash = create_input_processor_with_hash(
+ self.input_processor)
if model is None:
loader = ModelLoader(
pytorch_backend_config=pytorch_backend_config,
mapping=self.mapping,
spec_config=self.spec_config,
+ sparse_attention_config=self.sparse_attention_config,
max_num_tokens=max_num_tokens,
max_seq_len=max_seq_len,
lora_config=lora_config,
@@ -263,7 +270,8 @@ class PyTorchModelEngine(ModelEngine):
self.is_warmup = False
self.attn_backend = get_attention_backend(
- pytorch_backend_config.attn_backend)
+ pytorch_backend_config.attn_backend,
+ sparse_attn_config=sparse_attention_config)
if self.is_spec_decode:
self.spec_metadata = None
@@ -351,6 +359,9 @@ class PyTorchModelEngine(ModelEngine):
else:
self.cache_indirection_attention = None
+ def register_forward_pass_callable(self, callable: Callable):
+ self.forward_pass_callable = callable
+
@property
def runtime_draft_len(self):
return self.max_draft_len if self.enable_spec_decode else 0
@@ -446,10 +457,13 @@ class PyTorchModelEngine(ModelEngine):
@with_warmup_flag
def warmup(self, resource_manager: ResourceManager) -> None:
+ """
+ Orchestrates the warmup process by calling specialized warmup methods for
+ torch.compile, the autotuner, and CUDA graphs.
+ """
kv_cache_manager = resource_manager.get_resource_manager(
self.kv_cache_manager_key)
- spec_resource_manager = resource_manager.get_resource_manager(
- ResourceManagerType.SPEC_RESOURCE_MANAGER)
+
if kv_cache_manager is None:
logger.info("Skipping warm up as no KV Cache manager allocated.")
return
@@ -458,317 +472,394 @@ class PyTorchModelEngine(ModelEngine):
# Reset the global cuda graph dummy request to None in warmup.
self.cuda_graph_runner.padding_dummy_request = None
- def get_num_extra_decoding_steps():
- if isinstance(self.model, ChainDrafter):
- return self.model.max_draft_len
- else:
- assert not self.model_is_wrapped, (
- f"Please add logic to determine num_extra_decoding_steps for drafting loop {type(self.model)}"
- )
- return 0
+ # TODO: current warmup_request is not suitable for context parallelism.
+ cp_type = self.mapping.cp_config.get('cp_type', None)
+ if cp_type is not None:
+ logger.info("[ModelEngine::warmup] Skipping warmup for cp_type: ",
+ cp_type.name)
+ return
- def get_cuda_graph_warmup_request(batch_size, draft_len):
- # Divide by max_beam_width to get an approximation of the number of requests that can be run in parallel.
- available_blocks = kv_cache_manager.get_num_free_blocks(
- ) // self.max_beam_width
- if available_blocks >= batch_size:
- result = ScheduledRequests()
- result.context_requests = []
- num_extra_decoding_steps = get_num_extra_decoding_steps()
+ self._run_torch_compile_warmup(resource_manager)
+ self._run_autotuner_warmup(resource_manager)
+ self._run_cuda_graph_warmup(resource_manager)
- # Add (batch_size - 1) dummy requests with seq_len=1.
- # Should only need one more page per request.
- requests = kv_cache_manager.add_dummy_requests(
- list(range(batch_size - 1)),
- is_gen=True,
- max_num_draft_tokens=draft_len,
- use_mrope=self.use_mrope,
- max_beam_width=self.max_beam_width,
- num_extra_decoding_steps=num_extra_decoding_steps)
- # Divide by max_beam_width to get an approximation of the number of tokens that can be added to the final request.
- available_tokens = kv_cache_manager.get_num_available_tokens(
- draft_len)
+ # Set the value back to the original value after all warmups are complete
+ self.enable_spec_decode = self.is_spec_decode
- # Add one dummy request with the maximum possible sequence length.
- # The sequence length is limited by both the max_seq_len and the number of available blocks.
- # Also, the sequence length is limited by the max_position_embeddings.
- token_num = max(1, min(available_tokens, self.max_seq_len - 1))
- model_config = self.model.model_config.pretrained_config
- max_position_embeddings = getattr(model_config,
- 'max_position_embeddings',
- None)
- if max_position_embeddings is not None:
- token_num = min(token_num,
- max_position_embeddings - draft_len)
-
- assert token_num > num_extra_decoding_steps, (
- "Cannot fuse drafting loop. We do not have enough KV cache space "
- "for all of the draft tokens.")
- token_num -= num_extra_decoding_steps
-
- max_seq_len_request = kv_cache_manager.add_dummy_requests(
- request_ids=[batch_size - 1],
- token_nums=[token_num],
- is_gen=True,
- max_num_draft_tokens=draft_len,
- use_mrope=self.use_mrope,
- max_beam_width=self.max_beam_width,
- num_extra_decoding_steps=num_extra_decoding_steps)[0]
- # Add the longest request before all other seq_len=1 request to simulate the padding CUDA graph case.
- # This batch contains both the longest request and the shortest requests,
- # it also contains the maximum number of requests and the maximum token number,
- # which simulates the extreme case for the padding CUDA graph.
- # Thus we can replay this CUDA graph in all other cases.
- requests.insert(0, max_seq_len_request)
- result.generation_requests = requests
- if spec_resource_manager is not None:
- spec_resource_manager.add_dummy_requests(
- request_ids=list(range(batch_size)))
- else:
- result = None
- return result
-
- def get_warmup_request(num_tokens: int, num_gen_tokens: int):
- available_tokens = kv_cache_manager.get_num_available_tokens(
- self.runtime_draft_len)
- available_blocks = kv_cache_manager.get_num_free_blocks()
- if num_tokens > self.max_num_tokens or num_tokens > available_tokens:
- return None
-
- num_extra_decoding_steps = get_num_extra_decoding_steps()
- if num_extra_decoding_steps > 0:
- # Disable autotuning for fused drafting loops for now.
- # There are a few bugs that can cause illegal memory accesses
- # during warmup.
- return None
-
- num_ctx_tokens = num_tokens - num_gen_tokens
- num_ctx_requests = 0
- ctx_requests = []
- gen_requests = []
-
- max_seq_len = self.max_seq_len - 1
- num_full_seqs = 0
- num_left_over_tokens = 0
-
- if num_ctx_tokens > 0:
- # We will try to assign as less context requests as possible to
- # fill the num_ctx_tokens.
-
- # Num full sequences:
- num_full_seqs = num_ctx_tokens // max_seq_len
- num_left_over_tokens = num_ctx_tokens - num_full_seqs * max_seq_len
-
- num_ctx_requests = num_full_seqs + (1 if num_left_over_tokens
- > 0 else 0)
-
- # We do not have enough batch to fill the request
- if num_ctx_requests + num_gen_tokens > self.batch_size:
- return None
-
- blocks_to_use = num_full_seqs * math.ceil(
- max_seq_len / kv_cache_manager.tokens_per_block) + math.ceil(
- num_left_over_tokens /
- kv_cache_manager.tokens_per_block) + num_gen_tokens
-
- if blocks_to_use > available_blocks:
- return None
-
- if num_ctx_tokens > 0:
- ctx_token_nums = [max_seq_len] * num_full_seqs
- if num_left_over_tokens > 0:
- ctx_token_nums.append(num_left_over_tokens)
-
- ctx_requests = kv_cache_manager.add_dummy_requests(
- list(range(num_ctx_requests)),
- token_nums=ctx_token_nums,
- is_gen=False,
- max_num_draft_tokens=self.runtime_draft_len,
- use_mrope=self.use_mrope)
-
- if spec_resource_manager is not None:
- spec_resource_manager.add_dummy_requests(
- request_ids=list(range(num_ctx_requests)))
-
- if num_gen_tokens > 0:
- gen_requests = kv_cache_manager.add_dummy_requests(
- list(
- range(num_ctx_requests,
- num_ctx_requests + num_gen_tokens)),
- token_nums=[1] * num_gen_tokens,
- is_gen=True,
- max_num_draft_tokens=self.max_draft_len,
- use_mrope=self.use_mrope)
- if spec_resource_manager is not None:
- spec_resource_manager.add_dummy_requests(request_ids=list(
- range(num_ctx_requests, num_ctx_requests +
- num_gen_tokens)))
-
- result = ScheduledRequests()
- result.context_requests = ctx_requests
- result.generation_requests = gen_requests
-
- return result
+ def _run_torch_compile_warmup(self, resource_manager: ResourceManager):
+ """Runs warmup iterations to specialize torch.compile kernels."""
+ if not self._torch_compile_enabled:
+ return
+ logger.info("Running torch.compile warmup...")
+ kv_cache_manager = resource_manager.get_resource_manager(
+ self.kv_cache_manager_key)
curr_max_num_tokens = min(
kv_cache_manager.get_num_available_tokens(
self.original_max_draft_len), self.max_num_tokens,
self.batch_size * (self.max_seq_len - 1))
- def get_autotune_warmup_request():
- return get_warmup_request(curr_max_num_tokens, 0)
+ warmup_requests_configs = {
+ (1, 1), # Specialize for 1 token.
+ (self.batch_size,
+ self.batch_size), # max_batch_size, pure generation
+ (2, 0), # Non-one, pure context
+ (curr_max_num_tokens, 0), # max_num_tokens, pure context
+ }
- @contextlib.contextmanager
- def release_batch(result: ScheduledRequests | None):
- try:
- yield result
- finally:
- if result is not None:
- for req in result.all_requests():
- kv_cache_manager.free_resources(req)
- if spec_resource_manager is not None:
- spec_resource_manager.free_resources(req)
+ # Disable cuda graph capture here so that we can properly capture it later
+ with self.no_cuda_graph():
+ for num_tokens, num_gen_tokens in warmup_requests_configs:
+ with self._release_batch_context(
+ self._create_warmup_request(resource_manager,
+ num_tokens, num_gen_tokens),
+ resource_manager) as batch:
+ if batch is None:
+ continue # Not enough KV cache space
+ logger.info(
+                        f"Run warmup with {num_tokens} tokens, including {num_gen_tokens} generation tokens"
+ )
+ self.forward(batch,
+ new_tensors_device=None,
+ resource_manager=resource_manager)
+ torch.cuda.synchronize()
- # TODO: current warmup_request is not suitable for star attention
- cp_type = self.mapping.cp_config.get('cp_type', None)
- if cp_type == CpType.STAR:
+ def _run_autotuner_warmup(self, resource_manager: ResourceManager):
+ """Runs a forward pass to populate the autotuner cache."""
+ if not self.pytorch_backend_config.enable_autotuner:
return
- if self._torch_compile_enabled:
+ logger.info("Running autotuner warmup...")
+ kv_cache_manager = resource_manager.get_resource_manager(
+ self.kv_cache_manager_key)
+ curr_max_num_tokens = min(
+ kv_cache_manager.get_num_available_tokens(
+ self.original_max_draft_len), self.max_num_tokens,
+ self.batch_size * (self.max_seq_len - 1))
- warmup_requests = set([
- (1, 1), # Specialize for 1 token.
- (self.batch_size,
- self.batch_size), # max_batch_size, pure generation
- (2, 0), # Non-one, pure context
- (curr_max_num_tokens, 0), # max_num_tokens, pure context
- ])
+ cache_path = os.environ.get("TLLM_AUTOTUNER_CACHE_PATH", None)
+ with self.no_cuda_graph(), autotune(cache_path=cache_path,
+ rank=self.mapping.rank):
+ warmup_request = self._create_warmup_request(
+ resource_manager, curr_max_num_tokens, 0)
+ with self._release_batch_context(warmup_request,
+ resource_manager) as batch:
+ if batch is not None:
+ self.forward(batch,
+ new_tensors_device=None,
+ resource_manager=resource_manager)
+ torch.cuda.synchronize()
- # Disable cuda graph capture here so that we can properly capture it later
- with self.no_cuda_graph():
- for warmup_num_tokens, warmup_num_gen_tokens in warmup_requests:
-
- with release_batch(
- get_warmup_request(warmup_num_tokens,
- warmup_num_gen_tokens)) as batch:
- if batch is None:
- # No KV cache space!
- continue
- logger.info(
- f"Run warmup with {warmup_num_tokens} tokens, include {warmup_num_gen_tokens} generation tokens"
- )
- self.forward(batch,
- new_tensors_device=None,
- resource_manager=resource_manager)
- torch.cuda.synchronize()
-
- if self.pytorch_backend_config.enable_autotuner:
- # handle multiple rank issue
- cache_path = os.environ.get("TLLM_AUTOTUNER_CACHE_PATH", None)
- with self.no_cuda_graph(), autotune(cache_path=cache_path,
- rank=self.mapping.rank):
- result = get_autotune_warmup_request()
- with release_batch(result) as batch:
- if batch is None:
- # No KV cache space!
- pass
- else:
- self.forward(batch,
- new_tensors_device=None,
- resource_manager=resource_manager)
- torch.cuda.synchronize()
-
- logger.info(
- f"[Autotuner] Cache size after warmup is {len(AutoTuner.get().profiling_cache)}"
- )
-
- AutoTuner.get().print_profiling_cache()
+ logger.info(
+ f"[Autotuner] Cache size after warmup is {len(AutoTuner.get().profiling_cache)}"
+ )
+ AutoTuner.get().print_profiling_cache()
+ def _run_cuda_graph_warmup(self, resource_manager: ResourceManager):
+ """Captures CUDA graphs for various batch sizes and draft lengths."""
if not (self.cuda_graph_runner.enabled
or self._torch_compile_piecewise_cuda_graph):
return
+ self._capture_generation_cuda_graphs(resource_manager)
+ self._capture_piecewise_cuda_graphs(resource_manager)
+
+ def _capture_generation_cuda_graphs(self,
+ resource_manager: ResourceManager):
+ """Captures CUDA graphs for pure generation steps."""
+ if not self.cuda_graph_runner.enabled:
+ return
+
logger.info(
f"Creating CUDA graph instances for {len(self._cuda_graph_batch_sizes)} batch sizes."
)
- # Reverse the order of the cuda graph batch sizes to make smaller batch size graph could reuse larger batch size graph memory
+ spec_resource_manager = resource_manager.get_resource_manager(
+ ResourceManagerType.SPEC_RESOURCE_MANAGER)
+
+ # Reverse order so smaller graphs can reuse memory from larger ones
cuda_graph_batch_sizes = sorted(self._cuda_graph_batch_sizes,
reverse=True)
# Create CUDA graphs for different draft lengths
- draft_lengths = [self.max_draft_len]
- # For non-draft model, we also capture the CUDA graph instance for draft length 0,
- # so that when we disable spec decode at runtime, we can still run the captured graph.
- # Note that for one engine mode, we are not able to turn off spec decode at runtime.
- if (not self.is_draft_model and self.max_draft_len > 0
- and not self.spec_config.spec_dec_mode.use_one_engine()
- # Assume that speculation is always on if the user didn't give us a max_concurrency
- # value. This will save on memory.
- and self.spec_config.max_concurrency is not None):
- draft_lengths.append(0)
- if self.is_spec_decode and self.is_draft_model and spec_resource_manager is not None and isinstance(
- spec_resource_manager, Eagle3ResourceManager):
- draft_lengths.append(self.original_max_draft_len)
+ draft_lengths = []
+ if self.is_draft_model:
+ if self.model_is_wrapped and self.is_spec_decode and spec_resource_manager is not None and isinstance(
+ spec_resource_manager, Eagle3ResourceManager):
+ # The CDL path uses draft_len > 0 for the number of iterations in the drafting loop.
+ draft_lengths.append(self.original_max_draft_len)
+ else:
+ draft_lengths.append(self.max_draft_len)
+ else:
+ # For non-draft model, we also capture the CUDA graph instance for draft length 0,
+ # so that when we disable spec decode at runtime, we can still run the captured graph.
+ # Note that for one engine mode, we are not able to turn off spec decode at runtime.
+ if (self.max_draft_len > 0
+ and not self.spec_config.spec_dec_mode.use_one_engine()
+ # Assume that speculation is always on if the user didn't give us a max_concurrency
+ # value. This will save on memory.
+ and self.spec_config.max_concurrency is not None):
+ draft_lengths.append(0)
+            draft_lengths.append(self.max_draft_len)
for bs in cuda_graph_batch_sizes:
if bs > self.batch_size:
- # skip batch size larger than self.batch_size
continue
for draft_len in draft_lengths:
- with release_batch(get_cuda_graph_warmup_request(
- bs, draft_len)) as batch:
+ warmup_request = self._create_cuda_graph_warmup_request(
+ resource_manager, bs, draft_len)
+ with self._release_batch_context(warmup_request,
+ resource_manager) as batch:
if batch is None:
- # No KV cache space!
+ # No KV cache space, cannot continue capturing graphs
return
+
logger.info(
- f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"
+ f"Run generation-only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"
)
+
self.enable_spec_decode = draft_len > 0 or self.is_draft_model
-
- def _update_draft_inference_state(is_first_draft: bool,
- batch: ScheduledRequests):
- if self.is_draft_model and isinstance(
- spec_resource_manager, Eagle3ResourceManager):
- spec_resource_manager.is_first_draft = is_first_draft
- if is_first_draft:
- for req in batch.generation_requests:
- req.py_is_first_draft = True
- # Reset the draft tokens for the first draft inference
- req.py_draft_tokens = []
-
- _update_draft_inference_state(draft_len > 0, batch)
+ self._update_draft_inference_state_for_warmup(
+ batch, draft_len > 0, resource_manager)
self.forward(batch,
new_tensors_device=None,
resource_manager=resource_manager)
torch.cuda.synchronize()
- if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled:
- piecewise_cuda_graph_num_tokens = sorted(
- self._piecewise_cuda_graph_num_tokens, reverse=True)
+ def _capture_piecewise_cuda_graphs(self, resource_manager: ResourceManager):
+ """Captures piecewise CUDA graphs for context/prefill steps via torch.compile."""
+ if not (self._torch_compile_piecewise_cuda_graph
+ and self._torch_compile_enabled):
+ return
- with capture_piecewise_cuda_graph(True):
- for num_tokens in piecewise_cuda_graph_num_tokens:
- with self.no_cuda_graph():
- with release_batch(get_warmup_request(num_tokens,
- 0)) as batch:
- logger.info(
- f"Run piecewise CUDA graph warmup for num tokens={num_tokens}"
- )
+ logger.info("Running piecewise CUDA graph warmup...")
+ piecewise_cuda_graph_num_tokens = sorted(
+ self._piecewise_cuda_graph_num_tokens, reverse=True)
- for _ in range(3):
- self.forward(batch,
- new_tensors_device=None,
- resource_manager=resource_manager)
- self.forward(batch,
- new_tensors_device=None,
- resource_manager=resource_manager)
- torch.cuda.synchronize()
- gc.collect()
- torch.cuda.empty_cache()
+ with capture_piecewise_cuda_graph(True), self.no_cuda_graph():
+ for num_tokens in piecewise_cuda_graph_num_tokens:
+ warmup_request = self._create_warmup_request(
+ resource_manager, num_tokens, 0)
+ with self._release_batch_context(warmup_request,
+ resource_manager) as batch:
+ if batch is None:
+ continue
- # Set the value back to the original value
- self.enable_spec_decode = self.is_spec_decode
+ logger.info(
+ f"Run piecewise CUDA graph warmup for num tokens={num_tokens}"
+ )
+ # Run a few times to ensure capture
+ for _ in range(3):
+ self.forward(batch,
+ new_tensors_device=None,
+ resource_manager=resource_manager)
+
+ self.forward(batch,
+ new_tensors_device=None,
+ resource_manager=resource_manager)
+ torch.cuda.synchronize()
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ ### Helper methods promoted from the original warmup method ###
+
+ @contextlib.contextmanager
+ def _release_batch_context(self, batch: Optional[ScheduledRequests],
+ resource_manager: ResourceManager):
+ """A context manager to automatically free resources of a dummy batch."""
+ kv_cache_manager = resource_manager.get_resource_manager(
+ self.kv_cache_manager_key)
+ spec_resource_manager = resource_manager.get_resource_manager(
+ ResourceManagerType.SPEC_RESOURCE_MANAGER)
+ try:
+ yield batch
+ finally:
+ if batch is not None and kv_cache_manager is not None:
+ for req in batch.all_requests():
+ kv_cache_manager.free_resources(req)
+ if spec_resource_manager is not None:
+ spec_resource_manager.free_resources(req)
+
+ def _get_num_extra_decoding_steps(self) -> int:
+ """Determines extra decoding steps needed for fused drafting loops."""
+ if isinstance(self.model, ChainDrafter):
+ return self.model.max_draft_len
+ else:
+ assert not self.model_is_wrapped, (
+ f"Please add logic to determine num_extra_decoding_steps for drafting loop {type(self.model)}"
+ )
+ return 0
+
+ def _create_warmup_request(
+ self, resource_manager: ResourceManager, num_tokens: int,
+ num_gen_tokens: int) -> Optional[ScheduledRequests]:
+ """Creates a generic dummy ScheduledRequests object for warmup."""
+ kv_cache_manager = resource_manager.get_resource_manager(
+ self.kv_cache_manager_key)
+ spec_resource_manager = resource_manager.get_resource_manager(
+ ResourceManagerType.SPEC_RESOURCE_MANAGER)
+
+ available_tokens = kv_cache_manager.get_num_available_tokens(
+ self.runtime_draft_len)
+ available_blocks = kv_cache_manager.get_num_free_blocks()
+ if num_tokens > self.max_num_tokens or num_tokens > available_tokens:
+ return None
+
+ num_extra_decoding_steps = self._get_num_extra_decoding_steps()
+ if num_extra_decoding_steps > 0:
+ return None # Disable autotuning for fused drafting loops for now.
+
+ num_ctx_tokens = num_tokens - num_gen_tokens
+ num_ctx_requests = 0
+ ctx_requests = []
+ gen_requests = []
+
+ max_seq_len = self.max_seq_len - 1
+ num_full_seqs = 0
+ num_left_over_tokens = 0
+
+ if num_ctx_tokens > 0:
+ num_full_seqs = num_ctx_tokens // max_seq_len
+ num_left_over_tokens = num_ctx_tokens - num_full_seqs * max_seq_len
+ num_ctx_requests = num_full_seqs + (1 if num_left_over_tokens > 0
+ else 0)
+
+ if num_ctx_requests + num_gen_tokens > self.batch_size:
+ return None # Not enough batch size to fill the request
+
+ blocks_to_use = num_full_seqs * math.ceil(
+ max_seq_len / kv_cache_manager.tokens_per_block) + math.ceil(
+ num_left_over_tokens /
+ kv_cache_manager.tokens_per_block) + num_gen_tokens
+
+ if blocks_to_use > available_blocks:
+ return None
+
+ if num_ctx_tokens > 0:
+ ctx_token_nums = [max_seq_len] * num_full_seqs
+ if num_left_over_tokens > 0:
+ ctx_token_nums.append(num_left_over_tokens)
+
+ ctx_requests = kv_cache_manager.add_dummy_requests(
+ list(range(num_ctx_requests)),
+ token_nums=ctx_token_nums,
+ is_gen=False,
+ max_num_draft_tokens=self.runtime_draft_len,
+ use_mrope=self.use_mrope)
+
+ if spec_resource_manager is not None:
+ spec_resource_manager.add_dummy_requests(
+ request_ids=list(range(num_ctx_requests)))
+
+ if num_gen_tokens > 0:
+ gen_requests = kv_cache_manager.add_dummy_requests(
+ list(range(num_ctx_requests,
+ num_ctx_requests + num_gen_tokens)),
+ token_nums=[1] * num_gen_tokens,
+ is_gen=True,
+ max_num_draft_tokens=self.max_draft_len,
+ use_mrope=self.use_mrope)
+ if spec_resource_manager is not None:
+ spec_resource_manager.add_dummy_requests(request_ids=list(
+ range(num_ctx_requests, num_ctx_requests + num_gen_tokens)))
+
+ result = ScheduledRequests()
+ result.context_requests = ctx_requests
+ result.generation_requests = gen_requests
+ return result
+
+ def _create_cuda_graph_warmup_request(
+ self, resource_manager: ResourceManager, batch_size: int,
+ draft_len: int) -> Optional[ScheduledRequests]:
+ """Creates a dummy ScheduledRequests tailored for CUDA graph capture."""
+ kv_cache_manager = resource_manager.get_resource_manager(
+ self.kv_cache_manager_key)
+ spec_resource_manager = resource_manager.get_resource_manager(
+ ResourceManagerType.SPEC_RESOURCE_MANAGER)
+
+ available_blocks = kv_cache_manager.get_num_free_blocks(
+ ) // self.max_beam_width
+ if available_blocks < batch_size:
+ return None
+
+ result = ScheduledRequests()
+ result.context_requests = []
+ num_extra_decoding_steps = self._get_num_extra_decoding_steps()
+
+ # Add (batch_size - 1) dummy requests with seq_len=1.
+ requests = kv_cache_manager.add_dummy_requests(
+ list(range(batch_size - 1)),
+ is_gen=True,
+ max_num_draft_tokens=draft_len,
+ use_mrope=self.use_mrope,
+ max_beam_width=self.max_beam_width,
+ num_extra_decoding_steps=num_extra_decoding_steps)
+
+ available_tokens = kv_cache_manager.get_num_available_tokens(draft_len)
+
+ # Add one dummy request with the maximum possible sequence length.
+ token_num = max(1, min(available_tokens, self.max_seq_len - 1))
+ model_config = self.model.model_config.pretrained_config
+ max_position_embeddings = getattr(model_config,
+ 'max_position_embeddings', None)
+ if max_position_embeddings is not None:
+ token_num = min(token_num, max_position_embeddings - draft_len)
+
+ assert token_num > num_extra_decoding_steps, (
+ "Cannot fuse drafting loop. Not enough KV cache space for all draft tokens."
+ )
+ token_num -= num_extra_decoding_steps
+
+ max_seq_len_request = kv_cache_manager.add_dummy_requests(
+ request_ids=[batch_size - 1],
+ token_nums=[token_num],
+ is_gen=True,
+ max_num_draft_tokens=draft_len,
+ use_mrope=self.use_mrope,
+ max_beam_width=self.max_beam_width,
+ num_extra_decoding_steps=num_extra_decoding_steps)[0]
+
+ # Insert the longest request first to simulate padding for the CUDA graph.
+ requests.insert(0, max_seq_len_request)
+ result.generation_requests = requests
+ if spec_resource_manager is not None:
+ spec_resource_manager.add_dummy_requests(
+ request_ids=list(range(batch_size)))
+ return result
+
+ def _get_cuda_graph_draft_lengths(
+ self, resource_manager: ResourceManager) -> List[int]:
+ """Determines the draft lengths for which to capture CUDA graphs."""
+ draft_lengths = [self.max_draft_len]
+ spec_resource_manager = resource_manager.get_resource_manager(
+ ResourceManagerType.SPEC_RESOURCE_MANAGER)
+
+ # For non-draft model, also capture a graph for draft_len=0
+ if (not self.is_draft_model and self.max_draft_len > 0
+ and not self.spec_config.spec_dec_mode.use_one_engine()
+ and self.spec_config.max_concurrency is not None):
+ draft_lengths.append(0)
+
+ # Special case for Eagle3 draft model
+ if (self.is_spec_decode and self.is_draft_model
+ and isinstance(spec_resource_manager, Eagle3ResourceManager)):
+ draft_lengths.append(self.original_max_draft_len)
+
+ return list(set(draft_lengths)) # Use set to remove duplicates
+
+ def _update_draft_inference_state_for_warmup(
+ self, batch: ScheduledRequests, is_first_draft: bool,
+ resource_manager: ResourceManager):
+ """Updates request states for specific draft model warmups like Eagle3."""
+ spec_resource_manager = resource_manager.get_resource_manager(
+ ResourceManagerType.SPEC_RESOURCE_MANAGER)
+ if self.is_draft_model and isinstance(spec_resource_manager,
+ Eagle3ResourceManager):
+ spec_resource_manager.is_first_draft = is_first_draft
+ if is_first_draft:
+ for req in batch.generation_requests:
+ req.py_is_first_draft = True
+ req.py_draft_tokens = []
def _set_up_attn_metadata(self, kv_cache_manager: KVCacheManager):
enable_context_mla_with_cached_kv = is_mla(
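The sizing logic in _create_warmup_request packs the requested warmup token budget into as few dummy context requests as possible and then checks it against the KV-cache block budget. A worked sketch of that arithmetic, with illustrative numbers that are assumptions rather than values from the diff:

# --- illustrative sketch, not part of the diff ---
import math

def warmup_request_shape(num_tokens: int, num_gen_tokens: int,
                         max_seq_len: int, tokens_per_block: int):
    """Return (ctx_token_nums, blocks_to_use) for a dummy warmup batch."""
    num_ctx_tokens = num_tokens - num_gen_tokens
    num_full_seqs = num_ctx_tokens // max_seq_len
    num_left_over_tokens = num_ctx_tokens - num_full_seqs * max_seq_len

    ctx_token_nums = [max_seq_len] * num_full_seqs
    if num_left_over_tokens > 0:
        ctx_token_nums.append(num_left_over_tokens)

    blocks_to_use = (num_full_seqs * math.ceil(max_seq_len / tokens_per_block) +
                     math.ceil(num_left_over_tokens / tokens_per_block) +
                     num_gen_tokens)  # one block per 1-token generation request
    return ctx_token_nums, blocks_to_use

# e.g. warmup_request_shape(8192, 0, max_seq_len=2047, tokens_per_block=64)
# -> four full 2047-token context requests plus one 4-token request,
#    using 4 * 32 + 1 = 129 KV cache blocks.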
@@ -787,7 +878,8 @@ class PyTorchModelEngine(ModelEngine):
enable_flash_mla=self.model.model_config.enable_flash_mla,
enable_context_mla_with_cached_kv=
enable_context_mla_with_cached_kv,
- cache_indirection=cache_indirection)
+ cache_indirection=cache_indirection,
+ sparse_attention_config=self.sparse_attention_config)
if self.attn_metadata is not None:
# This assertion can be relaxed if needed: just create a new metadata
@@ -804,7 +896,8 @@ class PyTorchModelEngine(ModelEngine):
runtime_features=self.attn_runtime_features,
enable_flash_mla=self.model.model_config.enable_flash_mla,
enable_context_mla_with_cached_kv=enable_context_mla_with_cached_kv,
- cache_indirection=cache_indirection)
+ cache_indirection=cache_indirection,
+ sparse_attention_config=self.sparse_attention_config)
return self.attn_metadata
@@ -1139,6 +1232,7 @@ class PyTorchModelEngine(ModelEngine):
prompt_lengths.append(len(prompt_tokens))
past_seen_token_num = begin_compute
num_cached_tokens_per_seq.append(past_seen_token_num)
+ request.cached_tokens = num_cached_tokens_per_seq[-1]
# Multimodal
py_multimodal_runtime = MultimodalRuntimeData(
@@ -1249,6 +1343,7 @@ class PyTorchModelEngine(ModelEngine):
range(past_seen_token_num,
past_seen_token_num + 1 + num_draft_tokens)))
num_cached_tokens_per_seq.append(past_seen_token_num)
+ request.cached_tokens = num_cached_tokens_per_seq[-1]
# update batch index
request.py_batch_idx = request.py_seq_slot
else:
@@ -1282,6 +1377,7 @@ class PyTorchModelEngine(ModelEngine):
else:
num_cached_tokens_per_seq.append(past_seen_token_num +
self.runtime_draft_len + 1)
+ request.cached_tokens = num_cached_tokens_per_seq[-1]
if self.enable_spec_decode and spec_config.spec_dec_mode.extend_ctx(
self.attn_backend):
prompt_lengths.append(1 + self.runtime_draft_len)
@@ -1334,8 +1430,15 @@ class PyTorchModelEngine(ModelEngine):
if beam == first_beam:
previous_batch_indices.append(request.py_batch_idx)
past_seen_token_num = request.max_beam_num_tokens
- position_ids.append(past_seen_token_num)
+ position_id = past_seen_token_num
+ if self.mapping.has_cp_helix():
+ # Do an allgather among CP ranks to get the complete sequence length seen by all CP ranks.
+ past_seen_token_nums = self.dist.cp_allgather(
+ past_seen_token_num)
+ position_id = sum(past_seen_token_nums)
+ position_ids.append(position_id)
num_cached_tokens_per_seq.append(past_seen_token_num)
+ request.cached_tokens = num_cached_tokens_per_seq[-1]
prompt_lengths.append(request.py_prompt_len)
draft_lens.append(0)
sequence_lengths.append(1)
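Under Helix context parallelism each rank only counts the KV history it holds locally, so the generation-phase position id above is the sum of past_seen_token_num over all CP ranks. A minimal sketch, where cp_allgather stands in for the collective used above and is assumed to return one value per rank:

# --- illustrative sketch, not part of the diff ---
def helix_position_id(local_past_seen_token_num: int, cp_allgather) -> int:
    """Position id of the next token when the KV cache is sharded over CP ranks."""
    past_seen_token_nums = cp_allgather(local_past_seen_token_num)
    return sum(past_seen_token_nums)

# e.g. with 4 CP ranks each caching 512 tokens of the sequence, the next
# token's position id is 2048 even though every local KV cache holds 512.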
@@ -1858,6 +1961,7 @@ class PyTorchModelEngine(ModelEngine):
sequence_lengths.append(len(input_id))
block_ids_per_seq.extend([all_cache_indices])
num_cached_tokens_per_seq.append(past_seen_token_num)
+ request.cached_tokens = num_cached_tokens_per_seq[-1]
num_contexts = len(sequence_lengths)
for request in scheduled_requests.context_requests:
ctx_iter = request.ctx_iters
@@ -1897,6 +2001,7 @@ class PyTorchModelEngine(ModelEngine):
sequence_lengths.append(len(input_id))
block_ids_per_seq.extend([all_cache_indices])
num_cached_tokens_per_seq.append(past_seen_token_num)
+ request.cached_tokens = num_cached_tokens_per_seq[-1]
num_queries = len(sequence_lengths) - num_contexts
# Requests with draft tokens are treated like extend requests.
@@ -1954,6 +2059,7 @@ class PyTorchModelEngine(ModelEngine):
position_ids.append(last_query_pos_id + request.gen_iters + 1)
block_ids_per_seq.extend([all_cache_indices])
num_cached_tokens_per_seq.append(past_seen_token_num)
+ request.cached_tokens = num_cached_tokens_per_seq[-1]
num_tokens = len(input_ids)
assert num_tokens <= self.max_num_tokens, (
@@ -2111,13 +2217,17 @@ class PyTorchModelEngine(ModelEngine):
if CpType.STAR == cp_type:
return self._prepare_star_attention_inputs(
scheduled_requests, kv_cache_manager, attn_metadata)
+ elif CpType.HELIX == cp_type:
+ # Take the usual route of _prepare_tp_inputs.
+ pass
else:
- assert False, f'Unsupport cp_type {cp_type}'
- else:
- return self._prepare_tp_inputs(scheduled_requests, kv_cache_manager,
- attn_metadata, spec_metadata,
- new_tensors_device,
- cache_indirection_buffer)
+ raise NotImplementedError(
+ f"Unsupported cp_type {getattr(cp_type, 'name', cp_type)}.")
+
+ return self._prepare_tp_inputs(scheduled_requests, kv_cache_manager,
+ attn_metadata, spec_metadata,
+ new_tensors_device,
+ cache_indirection_buffer)
@torch.inference_mode()
@with_model_extra_attrs(lambda self: self.model.extra_attrs)
@@ -2186,35 +2296,38 @@ class PyTorchModelEngine(ModelEngine):
new_tensors_device, cache_indirection_buffer)
self.iter_counter += 1
-
- if not maybe_graph:
- # Fallback to eager execution if graph was not used
- with MoeLoadBalancerIterContext(moe_load_balancer):
- outputs = self._forward_step(inputs, gather_ids,
- gather_context_logits)
- else:
- if self.cuda_graph_runner.needs_capture(key):
-
- def capture_forward_fn(inputs: Dict[str, Any]):
- with MoeLoadBalancerIterContext(moe_load_balancer):
- return self._forward_step(
- inputs,
- gather_ids=gather_ids,
- gather_context_logits=gather_context_logits)
-
- def capture_postprocess_fn(inputs: Dict[str, Any]):
- self._postprocess_inputs(inputs)
-
- self.cuda_graph_runner.capture(key, capture_forward_fn,
- inputs,
- capture_postprocess_fn)
-
- # here we don't need to use context since cuda graph capture didn't run kernel.
- # maybe we need a cleaner way to do this.
- outputs = self.cuda_graph_runner.replay(key, inputs)
- else:
+ with with_shared_pool(self.cuda_graph_runner.get_graph_pool()):
+ if not maybe_graph:
+ # Fallback to eager execution if graph was not used
with MoeLoadBalancerIterContext(moe_load_balancer):
+ outputs = self._forward_step(inputs, gather_ids,
+ gather_context_logits)
+ else:
+ if self.cuda_graph_runner.needs_capture(key):
+
+ def capture_forward_fn(inputs: Dict[str, Any]):
+ with MoeLoadBalancerIterContext(moe_load_balancer):
+ return self._forward_step(
+ inputs,
+ gather_ids=gather_ids,
+ gather_context_logits=gather_context_logits)
+
+ def capture_postprocess_fn(inputs: Dict[str, Any]):
+ self._postprocess_inputs(inputs)
+
+ self.cuda_graph_runner.capture(key, capture_forward_fn,
+ inputs,
+ capture_postprocess_fn)
+
+ # here we don't need to use context since cuda graph capture didn't run kernel.
+ # maybe we need a cleaner way to do this.
outputs = self.cuda_graph_runner.replay(key, inputs)
+ else:
+ with MoeLoadBalancerIterContext(moe_load_balancer):
+ outputs = self.cuda_graph_runner.replay(key, inputs)
+
+ if self.forward_pass_callable is not None:
+ self.forward_pass_callable()
self._execute_logit_post_processors(scheduled_requests, outputs)
@@ -2247,21 +2360,34 @@ class PyTorchModelEngine(ModelEngine):
inputs = self._preprocess_inputs(inputs)
if inputs.get('spec_metadata', None):
gather_ids = inputs['spec_metadata'].gather_ids
- if self.without_logits:
- outputs = self.model_forward(**inputs)
- return outputs
            # For simplicity, just return all the logits if we have special gather_ids
# from speculative decoding.
- logits = self.model_forward(
+ outputs = self.model_forward(
**inputs,
return_context_logits=gather_ids is not None
or gather_context_logits,
)
- if gather_ids is not None:
- return {'logits': logits[gather_ids]}
+
+ if self.without_logits:
+ return outputs
+
+ if isinstance(outputs, dict):
+ # If the model returns a dict, get the logits from it. All other keys are kept.
+ logits = outputs.get('logits', None)
+ # If the logits are not found, no further processing is needed.
+ if logits is None:
+ return outputs
else:
- return {'logits': logits}
+ # If the model returns a single tensor, assume it is the logits and wrap it in a dict.
+ logits = outputs
+ outputs = {'logits': logits}
+
+ # If we have special gather_ids, gather the logits
+ if gather_ids is not None:
+ outputs['logits'] = logits[gather_ids]
+
+ return outputs
@nvtx_range("_forward_step_mm_encoder_only")
def _forward_step_mm_encoder_only(
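The _forward_step change above normalizes the model output so callers always receive a dict with a 'logits' entry when logits exist, gathered by gather_ids when speculative decoding provides them. A condensed sketch of that post-processing (the function name and signature are illustrative):

# --- illustrative sketch, not part of the diff ---
from typing import Any, Dict, Optional, Union
import torch

def normalize_outputs(outputs: Union[torch.Tensor, Dict[str, Any]],
                      gather_ids: Optional[torch.Tensor],
                      without_logits: bool):
    """Wrap raw model outputs into a dict and gather logits when requested."""
    if without_logits:
        return outputs  # models that emit no logits pass through untouched
    if isinstance(outputs, dict):
        logits = outputs.get('logits', None)
        if logits is None:
            return outputs  # nothing to gather; keep all other keys
    else:
        logits = outputs
        outputs = {'logits': logits}
    if gather_ids is not None:
        outputs['logits'] = logits[gather_ids]
    return outputs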
diff --git a/latest/_modules/index.html b/latest/_modules/index.html
index 3ab4a677d2..59fd5b04e5 100644
--- a/latest/_modules/index.html
+++ b/latest/_modules/index.html
@@ -511,6 +515,7 @@
from tqdm import tqdm
from transformers import PreTrainedTokenizerBase

+from tensorrt_llm._utils import mpi_disabled
from tensorrt_llm.inputs.data import TextPrompt
from tensorrt_llm.inputs.multimodal import MultimodalInput, MultimodalParams
from tensorrt_llm.inputs.registry import DefaultInputProcessor
@@ -628,6 +633,7 @@
                 **kwargs: Any) -> None:
        self._executor_cls = kwargs.pop("executor_cls", GenerationExecutor)
+        self._orchestrator_type = kwargs.get("orchestrator_type", None)
        self._llm_id = None
        log_level = logger.level
@@ -638,6 +644,12 @@
        if backend == "pytorch":
            logger.info("Using LLM with PyTorch backend")
            llm_args_cls = TorchLlmArgs
+            if self._orchestrator_type == "ray" or mpi_disabled():
+                self._orchestrator_type = "ray"
+                os.environ["TLLM_DISABLE_MPI"] = "1"
+                # Propagate to args construction
+                kwargs["orchestrator_type"] = "ray"
+
        elif backend == '_autodeploy':
            logger.info("Using LLM with AutoDeploy backend")
            from .._torch.auto_deploy.llm_args import \
@@ -758,6 +770,7 @@
                DisaggregatedParams, Sequence[DisaggregatedParams]]] = None,
        scheduling_params: Optional[Union[SchedulingParams,
                                          List[SchedulingParams]]] = None,
+        cache_salt: Optional[Union[str, Sequence[str]]] = None,
    ) -> Union[RequestOutput, List[RequestOutput]]:
        """Generate output for the given prompts in the synchronous mode.
        Synchronous generation accepts either single prompt or batched prompts.
@@ -778,6 +791,7 @@
                Disaggregated parameters. Defaults to None.
            scheduling_params (tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], optional):
                Scheduling parameters. Defaults to None.
+            cache_salt (str, Sequence[str], optional): If specified, the KV cache will be salted with the provided string to limit KV cache reuse to requests with the same string. Defaults to None.

        Returns:
            Union[tensorrt_llm.llmapi.RequestOutput, List[tensorrt_llm.llmapi.RequestOutput]]: The output data of the completion request to the LLM.
        """
@@ -808,7 +822,9 @@
                    i),
                disaggregated_params=_item_at(disaggregated_params, i),
                scheduling_params=_item_at(scheduling_params, i),
-                streaming=False)
+                cache_salt=_item_at(cache_salt, i),
+                streaming=False,
+            )
            futures.append(future)

        for future in tqdm(futures,
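The cache_salt argument added above scopes KV cache reuse to requests that carry the same salt. A hedged usage sketch, where the model path and prompts are placeholders:

# --- illustrative usage sketch, not part of the diff ---
from tensorrt_llm import LLM, SamplingParams

llm = LLM(model="/path/to/model")
params = SamplingParams(max_tokens=32)

# Requests that share the same cache_salt may reuse each other's KV cache;
# requests with a different salt (or no salt) will not.
out_a = llm.generate(["What is the capital of France?"], params,
                     cache_salt="tenant-a")
out_b = llm.generate(["What is the capital of France?"], params,
                     cache_salt="tenant-b")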
@@ -1102,10 +1118,6 @@
                              is_gen_only: bool) -> None:
        if self.args.backend in ["pytorch", "_autodeploy"]:
-            if sampling_params.logprobs and sampling_params.logprobs > 1:
-                raise ValueError(
-                    f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
-                )
            # Check prompt length and query length against max_num_tokens to filter illegal requests.
            # Skip check for gen-only requests
            if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill and not is_gen_only:
@@ -1450,8 +1462,7 @@
                num_postprocess_workers=self.args.num_postprocess_workers,
                postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir,
            ),
-            is_llm_executor=True,
-            lora_config=lora_config)
+            is_llm_executor=True)


@append_docstring(TORCH_LLM_DOCSTRING)
@@ -1492,6 +1503,34 @@
                         backend=backend,
                         **kwargs)
+ @set_api_status("prototype")
+    def _collective_rpc(self,
+                        method: str,
+                        args: tuple[Any, ...] = (),
+                        kwargs: Optional[dict] = None,
+                        non_block: bool = False,
+                        unique_reply_rank: Optional[int] = None) -> list[Any]:
+        """
+ Execute an RPC call on all GPU workers. Currently, this is only supported for RayExecutor.
+
+ Args:
+ method (str): The name of the worker method to execute.
+ args (tuple[Any, ...]): Positional arguments to pass to the worker method. Defaults to ().
+ kwargs (dict, optional): Keyword arguments to pass to the worker method. Defaults to None.
+            non_block (bool): If True, do not block until all workers have completed the RPC call. Defaults to False.
+ unique_reply_rank (int, optional): The rank of the worker that will be used to send the reply. Defaults to None.
+
+ Returns:
+ list[Any]: A list of results from each worker.
+ """
+        if hasattr(self._executor, 'collective_rpc'):
+            return self._executor.collective_rpc(method, args, kwargs,
+                                                 non_block, unique_reply_rank)
+        else:
+            raise ValueError(
+                f"Executor type {type(self._executor)} does not support collective RPC."
+            )
+
    def _build_model(self):
        super()._build_model()
        assert self._engine_dir is None
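_collective_rpc above is a prototype API that fans an RPC out to every GPU worker when the executor supports it (currently the Ray executor). A hedged usage sketch, where "method_name" is a placeholder for a worker method:

# --- illustrative sketch, not part of the diff ---
# Only meaningful when the executor supports collective_rpc (RayExecutor);
# other executor types raise ValueError.
results = llm._collective_rpc("method_name", args=(), kwargs=None,
                              non_block=False)
# `results` is a list with one entry per GPU worker.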
@@ -1525,9 +1564,6 @@
                postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir,
            ),
            is_llm_executor=True,
-            lora_config=self.args.lora_config,
-            # Autodeploy does not support kv_connector_config
-            kv_connector_config=getattr(self.args, "kv_connector_config", None),
            hf_model_dir=self._hf_model_dir,
            tokenizer=self.tokenizer,
            llm_args=self.args)
diff --git a/latest/_modules/tensorrt_llm/llmapi/llm_args.html b/latest/_modules/tensorrt_llm/llmapi/llm_args.html
index 5b155fd7ed..c1161ad791 100644
--- a/latest/_modules/tensorrt_llm/llmapi/llm_args.html
+++ b/latest/_modules/tensorrt_llm/llmapi/llm_args.html
@@ -330,6 +330,7 @@
class MoeConfig(StrictBaseModel):
@@ -890,7 +960,39 @@
    max_concurrency: Optional[int] = None
    load_format: Optional[str] = None
+    # PyTorch only.
+    # Rolling average window size (N) for acceptance length across completed requests.
+    # If not set or set to 0, the feature is disabled.
+    acceptance_window: Optional[int] = None
+    # PyTorch only.
+    # Threshold for average acceptance length; speculation will be disabled
+    # permanently once the rolling average over the last N completed requests
+    # (N = acceptance_window) drops below this value.
+    acceptance_length_threshold: Optional[float] = None
+ # Validate acceptance controls at field level so they run on model creation
+    @field_validator('acceptance_window')
+    @classmethod
+    def _validate_acceptance_window(cls, v: Optional[int]):
+        if v is None:
+            return v
+        if v < 0:
+            raise ValueError(
+                f"acceptance_window must be >= 0 (0 disables), got {v}")
+        return v
+
+    @field_validator('acceptance_length_threshold')
+    @classmethod
+    def _validate_acceptance_length_threshold(cls, v: Optional[float]):
+        if v is None:
+            return v
+        if v < 0:
+            raise ValueError(
+                f"acceptance_length_threshold must be >= 0, got {v}")
+        return v
+
+ # If set, drafting is allowed to use chain drafter.
+    _allow_chain_drafter: bool = PrivateAttr(True)

    # If set, drafting uses greedy sampling, irrespective of sampling parameters.
    _allow_greedy_draft_tokens: bool = PrivateAttr(True)
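The acceptance_window and acceptance_length_threshold fields above gate speculation on a rolling acceptance-length average, with field validators rejecting negative values. A standalone sketch of the same validator pattern; the class name here is hypothetical, not the config class in the diff:

# --- standalone sketch, not part of the diff ---
from typing import Optional
from pydantic import BaseModel, field_validator

class AcceptanceControls(BaseModel):
    acceptance_window: Optional[int] = None
    acceptance_length_threshold: Optional[float] = None

    @field_validator('acceptance_window')
    @classmethod
    def _validate_acceptance_window(cls, v: Optional[int]):
        if v is not None and v < 0:
            raise ValueError(
                f"acceptance_window must be >= 0 (0 disables), got {v}")
        return v

AcceptanceControls(acceptance_window=8, acceptance_length_threshold=2.0)  # ok
# AcceptanceControls(acceptance_window=-1) would raise a ValidationError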
@@ -905,6 +1007,7 @@
        "Lookahead": LookaheadDecodingConfig,
        "NGram": NGramDecodingConfig,
        "DraftTarget": DraftTargetDecodingConfig,
+        "SaveState": SaveHiddenStatesDecodingConfig,
        "UserProvided": UserProvidedDecodingConfig,
        "AUTO": AutoDecodingConfig,
    }
@@ -1111,6 +1214,64 @@
+
+    def validate(self) -> None:
+        if self.output_directory is None or not self.eagle3_layers_to_capture:
+            raise ValueError(
+                "Save directory and layers to capture must be provided")
+
+
+    @functools.cached_property
+    def spec_dec_mode(self):
+        from tensorrt_llm._torch.speculative.interface import \
+            SpeculativeDecodingMode as TorchSpeculativeDecodingMode
+        return TorchSpeculativeDecodingMode.SAVE_HIDDEN_STATES
+
+ @functools.cached_property
+    def num_capture_layers(self):
+        """
+        Returns the number of layers to capture of the target model.
+        If eagle3_layers_to_capture is not None, return the length of the set.
+        Otherwise, assume Eagle3 base set and return 3 + 1 (for post norm last hidden state).
+        """
+        if self.eagle3_layers_to_capture is None:
+            return 4
+        return len(self.eagle3_layers_to_capture)
-@dataclass(slots=True)
-class PluginConfig(metaclass=PluginConfigMeta):
+class PluginConfig(BaseModel):
    """The config that manages plugin-related options.
    There are two option categories:
@@ -649,356 +600,291 @@
    * Other features. These options can be assigned with boolean:
        * True, which means the plugin is enabled;
        * False, which means the plugin is disabled.
-
- Note: All the fields should use a prefix "_"; PluginConfigMeta will wrap each field as a property.
-    This ensures the fields can only be assigned with allowed values.
    """
-    _dtype: str = field(default="float16", init=False)
+    model_config = ConfigDict(validate_assignment=True, extra="ignore")
+
+    dtype: str = Field(default="float16",
+                       description="Base dtype for the model and plugins")

    # Plugins
-    _bert_attention_plugin: Optional[str] = field(
+    bert_attention_plugin: Optional[DefaultPluginDtype] = Field(
        default="auto",
- init=False,
- metadata={
- "help":
- "The plugin that uses efficient kernels and enables an in-place update of the KV cache for attention layer of BERT-like encoder models."
- })
-    _gpt_attention_plugin: Optional[str] = field(
+ description=
+ "The plugin that uses efficient kernels and enables an in-place update of the KV cache for attention layer of BERT-like encoder models."
+ )
+    gpt_attention_plugin: Optional[DefaultPluginDtype] = Field(
        default="auto",
- init=False,
- metadata={
- "help":
- "The plugin that uses efficient kernels and enables an in-place update of the KV cache for attention layer of GPT-like decoder models."
- })
-    _gemm_plugin: Optional[str] = field(
- default=None,
- init=False,
- metadata={
- "help":
+ description=
+ "The plugin that uses efficient kernels and enables an in-place update of the KV cache for attention layer of GPT-like decoder models."
+ )
+    gemm_plugin: Optional[Literal[
+        "auto", "float16", "float32", "bfloat16", "int32", "fp8", "nvfp4",
+        None]] = Field(
+            default=None,
+            description=
            "The GEMM plugin that utilizes NVIDIA cuBLASLt to perform GEMM operations. "
            "Note: it's only affective for non-quantized gemm operations (except FP8)."
- "Note: For FP8, it also requires same calibration in checkpoint."
- })
-    _explicitly_disable_gemm_plugin: bool = False
-    _gemm_swiglu_plugin: Optional[str] = field(
+            "Note: For FP8, it also requires same calibration in checkpoint.")
+    _explicitly_disable_gemm_plugin: bool = PrivateAttr(default=False)
+    gemm_swiglu_plugin: Optional[Literal["fp8", None]] = Field(
        default=None,
- init=False,
- metadata={
- "help":
- "The GEMM + SwiGLU fusion in Gated-MLP combines two Matmul operations and "
- "one SwiGLU operation into a single kernel. Currently this is only supported for FP8 precision on Hopper."
- })
-    _fp8_rowwise_gemm_plugin: Optional[str] = field(
+        description=
+        "The GEMM + SwiGLU fusion in Gated-MLP combines two Matmul operations and "
+        "one SwiGLU operation into a single kernel. Currently this is only supported for FP8 precision on Hopper."
+    )
+    fp8_rowwise_gemm_plugin: Optional[DefaultPluginDtype] = Field(
        default=None,
- init=False,
- metadata={
- "help":
- "The quantized GEMM for fp8, which uses per token dynamic scales for "
- "activation and per channel static scales for weights."
- "Note: It also requires same calibration in checkpoint."
- })
-    _qserve_gemm_plugin: Optional[str] = field(
+        description=
+        "The quantized GEMM for fp8, which uses per token dynamic scales for "
+        "activation and per channel static scales for weights."
+        "Note: It also requires same calibration in checkpoint.")
+    qserve_gemm_plugin: Optional[DefaultPluginDtype] = Field(
        default=None,
- init=False,
- metadata={
- "help":
- "The quantized GEMM from [QServe](https://arxiv.org/abs/2405.04532), "
- "which employs 4-bit quantization for weights and 8-bit quantization for activations."
- })
-    _identity_plugin: Optional[str] = field(
+        description=
+        "The quantized GEMM from [QServe](https://arxiv.org/abs/2405.04532), "
+        "which employs 4-bit quantization for weights and 8-bit quantization for activations."
+    )
+    identity_plugin: Optional[DefaultPluginDtype] = Field(
        default=None,
- init=False,
- metadata={
- "help":
- "The identity plugin simply copies inputs to outputs, it's used mostly for debugging purpose."
- })
-    _nccl_plugin: Optional[str] = field(
+        description=
+        "The identity plugin simply copies inputs to outputs, it's used mostly for debugging purpose."
+    )
+    nccl_plugin: Optional[DefaultPluginDtype] = Field(
        default="auto",
- init=False,
- metadata={
- "help":
- "The NCCL plugin wraps NCCL operators to support multi-GPU and even multi-nodes."
- })
-    _lora_plugin: Optional[str] = field(default=None,
-                                        init=False,
-                                        metadata={"help": "Enable LoRA."})
-    _dora_plugin: bool = field(default=False,
-                               init=False,
-                               metadata={"help": "Enable DoRA."})
-    _weight_only_groupwise_quant_matmul_plugin: Optional[str] = field(
+        description=
+        "The NCCL plugin wraps NCCL operators to support multi-GPU and even multi-nodes."
+    )
+    lora_plugin: Optional[DefaultPluginDtype] = Field(
+        default=None, description="Enable LoRA.")
+    dora_plugin: bool = Field(default=False, description="Enable DoRA.")
+    weight_only_groupwise_quant_matmul_plugin: Optional[
+        DefaultPluginDtype] = Field(
+            default=None,
+            description=
+            "Enable weight-only groupwise quantization matmul operators.")
+    weight_only_quant_matmul_plugin: Optional[DefaultPluginDtype] = Field(
        default=None,
- init=False,
- metadata={
- "help":
- "Enable weight-only groupwise quantization matmul operators."
- })
-    _weight_only_quant_matmul_plugin: Optional[str] = field(
-        default=None,
-        init=False,
-        metadata={"help": "Enable weight-only quantization matmul operators."})
-    _smooth_quant_plugins: bool = field(
+        description="Enable weight-only quantization matmul operators.")
+    smooth_quant_plugins: bool = Field(
        default=True,
- init=False,
- metadata={
-            "help": "Enable a group of plugins to support smooth quantization."
-        })
-    _smooth_quant_gemm_plugin: Optional[str] = field(
+        description="Enable a group of plugins to support smooth quantization.")
+    smooth_quant_gemm_plugin: Optional[DefaultPluginDtype] = Field(
        default=None,
- init=False,
- metadata={
- "help":
- "Enable plugin that supports smooth quantization gemm kernels."
- })
-    _layernorm_quantization_plugin: Optional[str] = field(
+        description=
+        "Enable plugin that supports smooth quantization gemm kernels.")
+    layernorm_quantization_plugin: Optional[DefaultPluginDtype] = Field(
        default=None,
- init=False,
- metadata={
- "help":
- "Enable plugin that supports layernorm quantization kernels."
- })
-    _rmsnorm_quantization_plugin: Optional[str] = field(
+        description="Enable plugin that supports layernorm quantization kernels."
+    )
+    rmsnorm_quantization_plugin: Optional[DefaultPluginDtype] = Field(
        default=None,
- init=False,
- metadata={
-            "help": "Enable plugin that supports rmsnorm quantization kernels."
-        })
-    _quantize_per_token_plugin: bool = field(
+        description="Enable plugin that supports rmsnorm quantization kernels.")
+    quantize_per_token_plugin: bool = Field(
        default=False,
- init=False,
- metadata={
-            "help": "Enable plugin that supports per-token quantization."
-        })
-    _quantize_tensor_plugin: bool = field(
+        description="Enable plugin that supports per-token quantization.")
+    quantize_tensor_plugin: bool = Field(
        default=False,
- init=False,
- metadata={
-            "help": "Enable plugin that supports per-tensor quantization."
-        })
-    _moe_plugin: Optional[str] = field(
+        description="Enable plugin that supports per-tensor quantization.")
+    moe_plugin: Optional[DefaultPluginDtype] = Field(
        default="auto",
- init=False,
- metadata={
- "help":
- "Enable some customized kernels to speed up the MoE layer of MoE models."
- })
- _mamba_conv1d_plugin:Optional[str]=field(
+ description=
+ "Enable some customized kernels to speed up the MoE layer of MoE models."
+ )
+ mamba_conv1d_plugin:Optional[DefaultPluginDtype]=Field(default="auto",
- init=False,
- metadata={
- "help":
- "Enable customized kernels to speed up conv1d operator for Mamba."
- })
- _low_latency_gemm_plugin:Optional[str]=field(
+ description=
+ "Enable customized kernels to speed up conv1d operator for Mamba.")
+ low_latency_gemm_plugin:Optional[Literal["fp8",None]]=Field(default=None,
- init=False,
- metadata={
- "help":
- "The GEMM plugin that optimized specially for low latency scenarios."
- })
- _low_latency_gemm_swiglu_plugin:Optional[str]=field(
+ description=
+ "The GEMM plugin that optimized specially for low latency scenarios.")
+ low_latency_gemm_swiglu_plugin:Optional[Literal["fp8",None]]=Field(default=None,
- init=False,
- metadata={
- "help":
- "The GEMM + SwiGLU fusion plugin that optimized specially for low latency scenarios."
- })
-
- _gemm_allreduce_plugin:Optional[str]=field(
- default=None,
- init=False,
- metadata={"help":"The GEMM + AllReduce kernel fusion plugin."})
+ description=
+ "The GEMM + SwiGLU fusion plugin that optimized specially for low latency scenarios."
+ )
+ gemm_allreduce_plugin:Optional[Literal[
+ "float16","bfloat16",
+ None]]=Field(default=None,
+ description="The GEMM + AllReduce kernel fusion plugin.")
+
+ # Features
- _context_fmha:bool=field(
+ context_fmha:bool=Field(default=True,
- init=False,
- metadata={
- "help":
- "Enable the fused multi-head attention during the context phase, "
- "will trigger a kernel that performs the MHA/MQA/GQA block using a single kernel."
- })
- _bert_context_fmha_fp32_acc:bool=field(
+ description=
+ "Enable the fused multi-head attention during the context phase, "
+ "will trigger a kernel that performs the MHA/MQA/GQA block using a single kernel."
+ )
+ bert_context_fmha_fp32_acc:bool=Field(default=False,
- init=False,
- metadata={
- "help":
- "Enable the FP32 accumulator for context FMHA in the bert_attention_plugin. "
- "If disabled, FP16 is used, better performance but potentially worse accuracy is expected."
- })
- _paged_kv_cache:Optional[bool]=field(
+ description=
+ "Enable the FP32 accumulator for context FMHA in the bert_attention_plugin. "
+ "If disabled, FP16 is used, better performance but potentially worse accuracy is expected."
+ )
+ paged_kv_cache:Optional[bool]=Field(default=None,
- init=False,
- metadata={
- "help":
- "Enable paged KV cache, which helps manage memory for the KV cache more efficiently, "
- "and usually leads to an increase in the batch size and an improved efficiency."
- })
- _remove_input_padding:bool=field(
+ description=
+ "Enable paged KV cache, which helps manage memory for the KV cache more efficiently, "
+ "and usually leads to an increase in the batch size and an improved efficiency."
+ )
+ remove_input_padding:bool=Field(default=True,
- init=False,
- metadata={
- "help":
- "Pack different tokens together, which reduces both the amount of computations and memory consumption."
- })
- _norm_quant_fusion:bool=field(
+ description=
+ "Pack different tokens together, which reduces both the amount of computations and memory consumption."
+ )
+ norm_quant_fusion:bool=Field(default=False,
- init=False,
- metadata={
- "help":
- "Fuse the LayerNorm and quantization kernels into a single kernel, "
- "resulting in improved end-to-end performance."
- })
- _reduce_fusion:bool=field(
+ description=
+ "Fuse the LayerNorm and quantization kernels into a single kernel, "
+ "resulting in improved end-to-end performance.")
+ reduce_fusion:bool=Field(default=False,
- init=False,
- metadata={
- "help":
- "Fuse the ResidualAdd and LayerNorm kernels after AllReduce into a single kernel, "
- "resulting in improved end-to-end performance."
- })
- _user_buffer:bool=field(
+ description=
+ "Fuse the ResidualAdd and LayerNorm kernels after AllReduce into a single kernel, "
+ "resulting in improved end-to-end performance.")
+ user_buffer:bool=Field(default=False,
- init=False,
- metadata={
- "help":
- "Eliminate extra copies from the local buffer to the shared buffer "
- "in the communication kernel, leading to improved end-to-end performance. "
- "This feature must be enabled with `--reduce_fusion enable` and "
- "is currently only supported for the FP8 LLAMA model."
- })
- _tokens_per_block:int=field(
+ description=
+ "Eliminate extra copies from the local buffer to the shared buffer "
+ "in the communication kernel, leading to improved end-to-end performance. "
+ "This feature must be enabled with `--reduce_fusion enable` and "
+ "is currently only supported for the FP8 LLAMA model.")
+ tokens_per_block:int=Field(default=32,
- init=False,
- metadata={
- "help":
- "Define how many tokens are contained in each paged kv cache block."
- })
- _use_paged_context_fmha:bool=field(
+ description=
+ "Define how many tokens are contained in each paged kv cache block.")
+ use_paged_context_fmha:bool=Field(default=True,
- init=False,
- metadata={
- "help":
- "Allow advanced features like KV cache reuse and chunked context."
- })
- _use_fp8_context_fmha:bool=field(
+ description=
+ "Allow advanced features like KV cache reuse and chunked context.")
+ use_fp8_context_fmha:bool=Field(default=True,
- init=False,
- metadata={
- "help":
- "When FP8 quantization is activated, the attention can be further accelerated by enabling FP8 Context FMHA"
- })
- _fuse_fp4_quant:bool=field(
+ description=
+ "When FP8 quantization is activated, the attention can be further accelerated by enabling FP8 Context FMHA"
+ )
+ fuse_fp4_quant:bool=Field(default=False,
- init=False,
- metadata={
- "help":"Whether to fuse FP4 quantization into attention kernel."
- })
- _multiple_profiles:bool=field(
+ description="Whether to fuse FP4 quantization into attention kernel.")
+ multiple_profiles:bool=Field(default=False,
- init=False,
- metadata={
- "help":
- "Enables multiple TensorRT optimization profiles in the built engines, "
- "will benefits the performance especially when GEMM plugin is disabled, "
- "because more optimization profiles help TensorRT have more chances to select better kernels. "
- "Note: This feature increases engine build time but no other adverse effects are expected."
- })
- _paged_state:bool=field(
+ description=
+ "Enables multiple TensorRT optimization profiles in the built engines, "
+ "will benefits the performance especially when GEMM plugin is disabled, "
+ "because more optimization profiles help TensorRT have more chances to select better kernels. "
+ "Note: This feature increases engine build time but no other adverse effects are expected."
+ )
+ paged_state:bool=Field(default=True,
- init=False,
- metadata={
- "help":
- "Enable paged state, which helps manage memory for the RNN state more efficiently."
- })
- _streamingllm:bool=field(
+ description=
+ "Enable paged state, which helps manage memory for the RNN state more efficiently."
+ )
+ streamingllm:bool=Field(default=False,
- init=False,
- metadata={
- "help":
- "Enable [StreamingLLM](https://arxiv.org/abs/2309.17453), which uses a window attention to perform efficient and stable LLM on long texts."
- })
- _manage_weights:bool=field(
+ description=
+ "Enable [StreamingLLM](https://arxiv.org/abs/2309.17453), which uses a window attention to perform efficient and stable LLM on long texts."
+ )
+ manage_weights:bool=Field(default=False,
- init=False,
- metadata={
- "help":
- "Enable TensorRT LLM managed weights to speed up engine building process."
- })
- _use_fused_mlp:bool=field(
+ description=
+ "Enable TensorRT LLM managed weights to speed up engine building process."
+ )
+ use_fused_mlp:bool=Field(default=True,
- init=False,
- metadata={
- "help":
- "Enable horizontal fusion in Gated-MLP that combines two Matmul "
- "operations into a single one followed by a separate SwiGLU kernel."
- })
- _pp_reduce_scatter:bool=field(
+ description=
+ "Enable horizontal fusion in Gated-MLP that combines two Matmul "
+ "operations into a single one followed by a separate SwiGLU kernel.")
+ pp_reduce_scatter:bool=Field(default=False,
- init=False,
- metadata={
- "help":
- "Enable a pipeline parallelism optimization with "
- "ReduceScatter + AllGather targeting large MoE models."
- })
+ description="Enable a pipeline parallelism optimization with "
+ "ReduceScatter + AllGather targeting large MoE models.")
-    def update_from_dict(self, config: dict):
-        for name in config.keys():
-            if hasattr(self, name):
-                value_to_be_update = config[name]
-                if isinstance(getattr(self, name),
-                              bool) or name == 'paged_kv_cache':
-                    if value_to_be_update == "enable":
-                        value_to_be_update = True
-                    elif value_to_be_update == "disable":
-                        value_to_be_update = False
-                elif value_to_be_update == "disable":
-                    value_to_be_update = None
-                setattr(self, name, value_to_be_update)
+    def __getattribute__(self, name: str) -> Any:
+        """Override to resolve 'auto' values to dtype field.
+        When a plugin field has value 'auto', return the value of dtype instead.
+        """
+        # Use object.__getattribute__ to avoid infinite recursion
+        value = object.__getattribute__(self, name)
+
+        if name != "dtype" and value == "auto":
+            return self.dtype
+
+        return value
+
+
+[docs]
+    @field_validator("*", mode="after")
+    @classmethod
+    def log_field_changes(cls, v: Any, info: ValidationInfo) -> Any:
+        """Log all field changes for debugging."""
+        logger.info(f"Set {cls.__name__}.{info.field_name} to {v}.")
+        return v
+
+
+
+[docs]
+    @classmethod
+    def from_arguments(cls, args: argparse.Namespace):
+        """Create a PluginConfig from argparse arguments."""
+        args = vars(args)
-        obj = cls.from_dict(args)
+        obj = cls(**args)
+        # We want to know if the user explicitly disabled the gemm_plugin
+        # because nvfp4 gemm uses plugin by default currently
+        if 'gemm_plugin' in args and args['gemm_plugin'] == 'disable':
+            obj._explicitly_disable_gemm_plugin = True
-        return obj
+        return obj
-    def to_dict(self):
-        config = asdict(self)
-        # Remove prefix "_" of the storage name
-        config = {key.lstrip('_'): value for key, value in config.items()}
-        return config
[docs]
    def to_legacy_setting(self):
-        '''Legacy setting means that all of the plugins and features are
+        """Legacy setting means that all of the plugins and features are
        disabled, this is needed for the legacy `build.py` script, which will be
        migrated to the centralized building script `tensorrt_llm/commands/build.py`.
        After the migration is done, this function may or may not be deleted.
-        '''
-        for field in fields(self):
-            # Remove prefix "_" of the storage name
-            field_name = field.name.lstrip('_')
-            if field_name == 'dtype':
+        """
+        for field_name, field_value in self:
+            if field_name == "dtype":
                continue
-            if field.type in (str, Optional[str]):
+            elif isinstance(field_value, str):
                setattr(self, field_name, None)
-            elif field.type == bool or field_name == 'paged_kv_cache':
+            elif isinstance(field_value,
+                            bool) or field_name == "paged_kv_cache":
                setattr(self, field_name, False)
+
[docs]
    def validate(self):
        unsupported_plugins = {
            # bert_attention_plugin is handled within BertAttention
@@ -1014,7 +900,8 @@
                val = getattr(self, plugin, None)
                if val is not None and val != False:
                    raise NotImplementedError(
- f"{plugin}={val} is not supported on SM {sm}.")
+ f"{plugin}={val} is not supported on SM {sm}.")
@@ -610,19 +614,6 @@
    pass  # noqa
-@dataclass(slots=True, kw_only=True)
-class AdditionalModelOutput:
-    """An additional output to gather from the model.
-
-    Args:
-        name (str): The name of the additional output to gather from the model.
-        gather_context (bool): A value indicating whether or not to gather the additional output from the context too. Defaults to False.
-    """  # noqa: E501
-
-    name: str
-    gather_context: bool
-
-
[docs]
@dataclass(slots=True, kw_only=True)
@@ -657,13 +648,25 @@
        best_of (int, optional): Number of sequences to consider for best output. Defaults to None.
        use_beam_search (bool): Whether to use beam search. Defaults to False.
- top_k (int, optional): Controls number of logits to sample from. None means using C++ runtime default 0, i.e., all logits. Defaults to None.
- top_p (float, optional): Controls the top-P probability to sample from. None means using C++ runtime default 0.f. Defaults to None.
+ top_k (int, optional): Controls number of logits to sample from. Can assume non-negative values, where 0 means 'all logits'. Defaults to None.
+ The value None is treated as "not specified" in the following.
+ If neither temperature, top_p, nor top_k are specified, sampling is greedy.
+ If temperature > 0 and/or top_p < 1 are specified, sampling will proceed accordingly and top_k will default to top_k = 0.
+ Setting top_k = 1 results in greedy sampling.
+ top_p (float, optional): Controls the top-P probability to sample from. Can have values between 0 and 1. Defaults to None.
+ The value None is treated as "not specified" in the following.
+ If neither temperature, top_p, nor top_k are specified, sampling is greedy.
+ If temperature > 0 and/or top_k > 1 are specified, sampling will proceed accordingly and top_p will default to top_p = 1.
+            Setting top_p = 0 should result in greedy sampling, but is currently disallowed in the backend.
        top_p_min (float, optional): Controls decay in the top-P algorithm. topPMin is lower-bound. None means using C++ runtime default 1.e-6. Defaults to None.
        top_p_reset_ids (int, optional): Controls decay in the top-P algorithm. Indicates where to reset the decay. None means using C++ runtime default 1. Defaults to None.
        top_p_decay (float, optional): Controls decay in the top-P algorithm. The decay value. None means using C++ runtime default 1.f. Defaults to None.
        seed (int, optional): Controls the random seed used by the random number generator in sampling. None means using C++ runtime default 0. Defaults to None.
- temperature (float, optional): Controls the modulation of logits when sampling new tokens. It can have values > 0.f. None means using C++ runtime default 1.0f. Defaults to None.
+ temperature (float, optional): Controls the modulation of logits when sampling new tokens. It can have values >= 0.f. Defaults to None.
+ The value None is treated as "not specified" in the following.
+ If neither temperature, top_p, nor top_k are specified, sampling is greedy.
+ If top_p < 1 and/or top_k > 1 are specified, sampling will proceed accordingly and temperature will default to temperature = 1.
+            Setting temperature = 0 results in greedy sampling.
        min_tokens (int, optional): Lower bound on the number of tokens to generate. Values < 1 have no effect. None means using C++ runtime default 1. Defaults to None.
        beam_search_diversity_rate (float, optional): Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. None means using C++ runtime default 1.f. Defaults to None.
        repetition_penalty (float, optional): Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. None means using C++ runtime default 1.f. Defaults to None.
@@ -682,7 +685,7 @@
        exclude_input_from_output (bool): Controls if output tokens in Result should include the input tokens. Defaults to True.
        return_encoder_output (bool): Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Defaults to False.
        return_perf_metrics (bool): Controls if Result should contain the performance metrics for this request. Defaults to False.
- additional_model_outputs (List[tensorrt_llm.sampling_params.AdditionalModelOutput], optional): The additional outputs to gather from the model. Defaults to None.
+        additional_model_outputs (List[str], optional): The additional outputs to gather from the model. Defaults to None.
        lookahead_config (tensorrt_llm.bindings.executor.LookaheadDecodingConfig, optional): Lookahead decoding config. Defaults to None.
        guided_decoding (tensorrt_llm.sampling_params.GuidedDecodingParams, optional): Guided decoding params. Defaults to None.
@@ -750,7 +753,7 @@
    exclude_input_from_output: bool = True
    return_encoder_output: bool = False
    return_perf_metrics: bool = False
- additional_model_outputs:Optional[List[AdditionalModelOutput]]=None
+    additional_model_outputs: Optional[List[str]] = None

    # Used in logprobs calculation in TRT flow to drop logits early if user did not explicitly request them.
    # Can be deprecated after migration to PyTorch backend.
@@ -799,11 +802,19 @@
        For instance, while the greedy decoding with n > 1 is capable in the Executor class of C++ runtime, the LLM API disallows such combination.
        """
-        if self.best_of < self.n:
+        if self.top_p is not None and (self.top_p < 0 or self.top_p > 1):
+            raise ValueError(f"require 0 <= top_p <= 1, got top_p={self.top_p}")
+        if self.top_k is not None and self.top_k < 0:
+            raise ValueError(f"require top_k >= 0, got top_k={self.top_k}")
+        if self.temperature is not None and self.temperature < 0:
+            raise ValueError(f"require temperature >= 0, got temperature={self.temperature}")
+
+        if self.best_of is not None and self.best_of < self.n:
            raise ValueError(
                f"best_of ({self.best_of}) cannot be less than n ({self.n})")

        if (
-                self.best_of > 1
+                self.best_of is not None
+                and self.best_of > 1
                and self._greedy_decoding
                and not os.environ.get("TLLM_ALLOW_N_GREEDY_DECODING", None)
        ):
@@ -827,12 +838,28 @@
        self.logprobs = self.logprobs and int(self.logprobs)
        self.prompt_logprobs = self.prompt_logprobs and int(self.prompt_logprobs)
+ # NB: Static, because downstream code only holds instances of
+ # bindings.SamplingConfig (not SamplingParams).
+
+
+
    @property
    def _greedy_decoding(self) -> bool:
-        return (
-            not self.use_beam_search
-            and (self.top_k is None or self.top_k == 1)
-            and (self.top_p is None or self.top_p == 0.0)
+        return not self.use_beam_search and self.params_imply_greedy_decoding(
+            temperature=self.temperature,
+            top_p=self.top_p,
+            top_k=self.top_k,
        )

    @property
@@ -981,6 +1008,12 @@
        else:
            config_kwargs["return_log_probs"] = self._return_log_probs

+        if config_kwargs.get("additional_model_outputs") is not None:
+            config_kwargs["additional_model_outputs"] = [
+                tllme.AdditionalModelOutput(name=output_name, gather_context=False)
+                for output_name in config_kwargs["additional_model_outputs"]
+            ]
+
        return tllme.OutputConfig(**config_kwargs)

    def _get_guided_decoding_params(self) -> tllme.GuidedDecodingParams:
@@ -1125,9 +1158,9 @@
diff --git a/latest/_sources/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md.txt b/latest/_sources/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md.txt
new file mode 100644
index 0000000000..4b80603e29
--- /dev/null
+++ b/latest/_sources/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md.txt
@@ -0,0 +1,239 @@
+# Scaling Expert Parallelism in TensorRT LLM (Part 3: Pushing the Performance Boundary)
+
+This blog post is a continuation of previous posts:
+* [Scaling Expert Parallelism in TensorRT LLM (Part 1: Design and Implementation of Large-scale EP)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md)
+* [Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)
+
+In this blog post, we focus on performance optimization, diving deeper into techniques such as lower precision, network structure refactoring, and aggressive kernel fusion. We hope this analysis and optimization process brings new inspiration to your model inference optimization work.
+
+*By NVIDIA TensorRT LLM Team*
+
+## Table of Contents
+- [Scaling Expert Parallelism in TensorRT LLM (Part 3: Pushing the Performance Boundary)](#scaling-expert-parallelism-in-tensorrt-llm-part-3-pushing-the-performance-boundary)
+ - [Table of Contents](#table-of-contents)
+ - [Overview](#overview)
+ - [Lower precision](#lower-precision)
+ - [wo GEMM FP4 quantization](#wo-gemm-fp4-quantization)
+ - [Low precision `AlltoAll`](#low-precision-alltoall)
+ - [FP8 context FMHA support](#fp8-context-fmha-support)
+ - [Rethink network structure](#rethink-network-structure)
+ - [MTP LM head tensor parallelism](#mtp-lm-head-tensor-parallelism)
+ - [Context phase Q/K/V `concat` optimization](#context-phase-qkv-concat-optimization)
+ - [More kernel overlap, fusion and optimization](#more-kernel-overlap-fusion-and-optimization)
+ - [Overlap kernels using programmatic dependent launch (PDL)](#overlap-kernels-using-programmatic-dependent-launch-pdl)
+ - [Fuse several `AlltoAll` kernels](#fuse-several-alltoall-kernels)
+ - [Fuse `add` (sparse exp and shared exp) into local reduction](#fuse-add-sparse-exp-and-shared-exp-into-local-reduction)
+ - [Optimize PyTorch native `copy` and `concat` using `torch.compile`](#optimize-pytorch-native-copy-and-concat-using-torchcompile)
+ - [End-to-End Performance](#end-to-end-performance)
+ - [Acknowledgements](#acknowledgements)
+
+## Overview
+
+Let's first take a look at the network structure before the optimizations, to give an overall view of what the workloads look like:
+
+Figure 1: Network structure overview before optimization
+
+In this third blog of our scaling Expert Parallelism (EP) series, we push the performance boundaries of large-scale EP on NVIDIA GB200 NVL72 through multiple optimization techniques. Building upon the foundation established in [part 1](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md) and [part 2](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md), this blog explores three key optimization pillars: **lower precision computation** (including FP4 quantization for wo GEMM, low-precision AlltoAll communication, and FP8 context FMHA), **network structure rethinking** (featuring MTP LM head tensor parallelism and context phase Q/K/V concatenation elimination), and **aggressive kernel fusion and overlap** (leveraging Programmatic Dependent Launch, fused AlltoAll operations, and torch.compile optimizations). These optimizations collectively deliver significant end-to-end performance improvements for wide-EP scenarios on NVIDIA GB200 NVL72, for DeepSeek R1 with its specialized Multi-head Latent Attention (MLA) mechanism. Each technique is carefully designed to maintain accuracy while maximizing performance, demonstrating the power of combining algorithmic innovation with deep hardware awareness.
+
+## Lower precision
+
+### wo GEMM FP4 quantization
+
+The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1's MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. In this term, "wo" is short for the output weight matrix.
+
+We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:
+* https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2
+* https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2
+
+In TensorRT LLM, this is supported by [PR 6393](https://github.com/NVIDIA/TensorRT-LLM/pull/6393). To utilize the checkpoints, simply use the LLM API or `trtllm-serve` to load them. Refer to [deploy-with-tensorrt-llm](https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2#deploy-with-tensorrt-llm) for more details.
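+
+As a minimal illustration of loading such a checkpoint with the LLM API (a hedged sketch: the model name comes from the HuggingFace links above, and arguments such as `tensor_parallel_size` should be adapted to your deployment):
+
+```python
+from tensorrt_llm import LLM
+
+# Load the FP4 "v2" checkpoint that additionally quantizes the attention wo GEMM.
+llm = LLM(model="nvidia/DeepSeek-R1-FP4-v2", tensor_parallel_size=8)
+
+outputs = llm.generate(["Explain expert parallelism in one sentence."])
+print(outputs[0].outputs[0].text)
+```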
+
+### Low precision `AlltoAll`
+
+In wide-EP MoE, the combine phase (after experts finish FC2) performs an all-to-all to return each token’s expert outputs to its origin rank, followed by a per-token reduce over top-k experts.
+
+This step is typically bandwidth-bound when FC2 outputs are in BF16 or FP16. We introduce a low-precision AlltoAll that transmits these combine payloads in NVFP4 instead of BF16/FP16, then dequantizes back on the receiver before the local reduction.
+
+During combine, we temporarily quantize the per-token expert outputs to NVFP4 (e2m1 values with per-16-element E4M3 scale factors plus a global scale) inside shared memory, send the compact representation across GPUs, and dequantize back to the original dtype on the receiving side. Indices and routing-related small tensors remain in their native types.
+
+Since we quantize only for transport and outputs are dequantized back to the working dtype before the per-token reduction, we observe negligible accuracy impact; tolerances comparable to a quant-dequant roundtrip are sufficient. This feature is supported by [PR 7155](https://github.com/NVIDIA/TensorRT-LLM/pull/7155) and [PR 7898](https://github.com/NVIDIA/TensorRT-LLM/pull/7898).
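+
+To make the quantize-for-transport idea concrete, here is a simplified, hedged emulation in PyTorch: it mimics per-16-element block scaling and the dequantize-before-reduction step, but it does not reproduce the actual e2m1 bit packing or the CUDA kernels used in TensorRT LLM:
+
+```python
+import torch
+
+BLOCK = 16  # per-16-element scaling blocks, mirroring the NVFP4 layout
+
+def quantize_for_transport(x: torch.Tensor):
+    """Emulate quantizing combine payloads before AlltoAll (values only, no bit packing)."""
+    blocks = x.float().view(-1, BLOCK)
+    scales = blocks.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 6.0  # e2m1 max magnitude is 6
+    q = (blocks / scales).round().clamp(-6, 6)  # stand-in for the 4-bit values
+    return q, scales
+
+def dequantize_after_transport(q, scales, shape, dtype):
+    # Back to the working dtype on the receiver before the per-token top-k reduction.
+    return (q * scales).view(shape).to(dtype)
+
+payload = torch.randn(128, 7168, dtype=torch.bfloat16)  # toy per-token expert outputs
+q, s = quantize_for_transport(payload)
+restored = dequantize_after_transport(q, s, payload.shape, payload.dtype)
+```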
+
+### FP8 context FMHA support
+
+FP8 context FMHA is a technique that uses the FP8 data format to accelerate the FMHA/MLA computation during the context phase of a model. This combination is designed to improve TTFT and prefill throughput, particularly when processing long contexts, without significantly sacrificing accuracy.
+
+In the context phase, the K and V can be stored in FP8 format, which is often referred to as FP8 KV Cache. Using FP8 KV cache can significantly save GPU memory, which is especially beneficial for long input sequences.
+However, since Q is in BF16 format, FMHA will also be performed in BF16 format, which cannot benefit from FP8 Tensor Core.
+
+With FP8 context FMHA, we first quantize Q into FP8 format, which aligns with FP8 K and V, and then leverage FP8 Tensor Core for FMHA/MLA. Since the context phase is compute-bound and Tensor Core has much higher FP8 FLOPS than BF16 FLOPS, the speed-up becomes more pronounced as the input sequence length grows.
+
+Since FP8 context FMHA can maintain accuracy very close to the BF16 baseline, we enable it automatically when users use FP8 KV cache on Hopper or Blackwell. This is supported by [PR 7610](https://github.com/NVIDIA/TensorRT-LLM/pull/7610) and [PR 7612](https://github.com/NVIDIA/TensorRT-LLM/pull/7612).
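+
+Conceptually, the change simply brings Q into the same FP8 format as the FP8 KV cache before running FMHA/MLA on FP8 Tensor Cores. A minimal sketch of that idea (not the actual fused kernels, and with scale handling simplified):
+
+```python
+import torch
+
+def quantize_q_to_fp8(q_bf16: torch.Tensor, q_scale: float) -> torch.Tensor:
+    # Scale into the FP8 E4M3 range and cast; K and V already live in the FP8 KV cache.
+    return (q_bf16 / q_scale).to(torch.float8_e4m3fn)
+
+q_fp8 = quantize_q_to_fp8(torch.randn(1024, 128, 192, dtype=torch.bfloat16), q_scale=1.0)
+# The FP8 Q, together with FP8 K/V, is then consumed by the FP8 FMHA/MLA kernel.
+```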
+
+## Rethink network structure
+
+### MTP LM head tensor parallelism
+
+The LM (language modeling) head is responsible for converting the `hidden_states` computed by previous decode layers to `logits`. It's a linear layer with weights in the shape of `(vocab_size, hidden_size)`, outputting logits with the shape of `(batch_size, seqlen, vocab_size)`. We are primarily interested in the logits corresponding to the last token of the input sequence, so the logits will finally be `(batch_size, vocab_size)`.
+
+When MTP is enabled, the number of tokens that the MTP layers handle equals the batch size, while the main model handles `(1 + MTP) * batch_size` tokens. This makes the LM head computation in the MTP layers more likely to fall into the memory-bound range; empirically, 256 tokens is the boundary between memory-bound and math-bound. This leads to an optimization idea: if we keep the calculation memory-bound but reduce the size of the weights that need to be loaded, there could be performance benefits.
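+
+As a rough, back-of-the-envelope illustration of why the MTP LM head stays memory-bound at small per-rank token counts, the sketch below computes the arithmetic intensity of the LM head GEMM; the dimensions are assumed DeepSeek-R1-like values used only for illustration:
+
+```python
+# Roofline-style estimate for the LM head GEMM: logits = hidden @ W^T
+hidden_size, vocab_size = 7168, 129280   # assumed DeepSeek-R1-like dimensions
+bytes_per_elem = 2                       # BF16 weights
+
+def arithmetic_intensity(num_tokens: int) -> float:
+    flops = 2 * num_tokens * hidden_size * vocab_size         # 2 FLOPs per MAC
+    weight_bytes = hidden_size * vocab_size * bytes_per_elem  # weight traffic dominates
+    return flops / weight_bytes
+
+for tokens in (32, 256, 1024):
+    print(tokens, arithmetic_intensity(tokens))
+# 32 tokens  -> ~32 FLOP/byte: far below the GEMM ridge point, i.e. memory-bound
+# 256 tokens -> ~256 FLOP/byte: around the empirical memory/math boundary
+```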
+
+Based on this analysis, we conducted experiments on the following scenario: a DeepSeek R1 EP32 case with attention DP and MTP-3 enabled, where the local per-rank batch size is 32. Before the optimization, there is 32-way data parallelism, so each MTP module on each rank processes 32 tokens for LM head calculation.
+
+Figure 2: MTP LM head computation before optimization
+
+In the optimization, we first perform an `AllGather` on every 4 GPUs, so that each GB200 node has all tokens prepared for the following TP4 calculation. Then, we split the LM head weights along the vocabulary dimension across those 4 GPUs and perform 4-way TP. Afterwards, we collect the local argmax logits on each TP rank, do a round of `AllGather` to collect them, and find the global argmax across all TP ranks. Collecting the local argmax logits first helps minimize the communication and argmax computation overheads. Finally, we split the logits to guarantee correctness.
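+
+The two-stage argmax can be sketched as follows. This is a minimal single-process illustration that simulates vocabulary-sharded TP ranks with tensor chunks instead of real `AllGather` calls; it is not the actual TensorRT LLM implementation:
+
+```python
+import torch
+
+def tp_argmax(logit_shards, vocab_per_rank):
+    """Local argmax on each vocab shard, then a cheap global argmax over the gathered pairs."""
+    local_vals, local_ids = [], []
+    for rank, shard in enumerate(logit_shards):        # shard: [batch, vocab_per_rank]
+        vals, ids = shard.max(dim=-1)                  # local argmax on this TP rank
+        local_vals.append(vals)
+        local_ids.append(ids + rank * vocab_per_rank)  # shift to global vocab ids
+    all_vals = torch.stack(local_vals, dim=-1)         # "AllGather" of per-rank (value, id)
+    all_ids = torch.stack(local_ids, dim=-1)
+    best_rank = all_vals.argmax(dim=-1, keepdim=True)
+    return all_ids.gather(-1, best_rank).squeeze(-1)   # global argmax token per sequence
+
+logits = torch.randn(32, 1024)                         # toy batch and vocab
+shards = list(logits.chunk(4, dim=-1))                 # 4-way vocab sharding
+assert torch.equal(tp_argmax(shards, 256), logits.argmax(dim=-1))
+```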
+
+Figure 3: MTP LM head computation after applying tensor parallelism
+
+*Some layers are omitted in the diagrams above to keep the example simple.*
+
+Note that we can expand the TP to 8-way to utilize multi-node NVLink, as long as we still achieve performance gains from reducing weight loading time in memory-bound scenarios.
+
+This feature is supported by [PR 7571](https://github.com/NVIDIA/TensorRT-LLM/pull/7571) and [PR 7891](https://github.com/NVIDIA/TensorRT-LLM/pull/7891).
+
+### Context phase Q/K/V `concat` optimization
+
+In the standard attention mechanism, Q/K/V are derived from the same hidden states through `GEMM_Q`/`GEMM_K`/`GEMM_V` operations, and TensorRT LLM typically merges the weights of these three GEMMs in advance, executing a single `GEMM_QKV` to obtain a large contiguous tensor QKV, which is then used as the input to the attention kernels.
+
+However, DeepSeek's MLA is a special attention module where Q/K/V are obtained by applying different downsampling-upsampling processes to the hidden states. Additionally, Q and K are divided into two parts: with RoPE and without RoPE, so a contiguous QKV tensor cannot be obtained directly.
+
+In the initial implementation of context MLA, due to input format constraints of the attention kernels, TensorRT LLM had to explicitly concatenate the Q/K/V tensors into one contiguous QKV tensor, resulting in extra memory and time overhead, which became more significant in wide EP scenarios.
+
+Recently, we introduced a new input format for the context MLA kernels called "separate qkv". As the name implies, these attention kernels now support three separate Q/K/V tensors as direct inputs. [PR 6538](https://github.com/NVIDIA/TensorRT-LLM/pull/6538) refactors the MLA process to eliminate the need for concatenating Q/K/V, saving copy operations and significantly improving prefill latency in wide EP scenarios.
+
+## More kernel overlap, fusion and optimization
+
+The team has implemented aggressive kernel fusion, overlap, and optimization to reduce kernel launch overheads and overall kernel duration. This includes overlapping kernels using PDL, fusing several `AlltoAll` kernels through refactoring, fusing sparse exp and shared exp `add` into local reduction, fusing `memset` into `expandinputrow`, fusing `finalizeMoeRouting` into FC2, and removing the `swizzle` kernel after `AlltoAll`. The following three representative examples demonstrate the common ideas behind these optimizations.
+
+### Overlap kernels using programmatic dependent launch (PDL)
+
+The Programmatic Dependent Launch (PDL) mechanism allows a dependent secondary kernel to launch before the primary kernel it depends on in the same CUDA stream has finished executing. Refer to the [official documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization) for more details. TensorRT LLM has been utilizing this feature to optimize end-to-end performance.
+
+We have introduced this feature to the kernels used by the wide EP workflow as well. The implementation is in [PR 7977](https://github.com/NVIDIA/TensorRT-LLM/pull/7977). We insert a `cudaTriggerProgrammaticLaunchCompletion` call in all thread blocks of the primary kernel, which signals that the secondary kernel is ready to launch, and then call `cudaGridDependencySynchronize` in the secondary kernel, which blocks until all primary kernels that the secondary kernel depends on have completed and flushed their results to global memory. The following example from the [official documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#api-description) demonstrates how PDL works; the only difference in TensorRT LLM is that we insert both `cudaTriggerProgrammaticLaunchCompletion` and `cudaGridDependencySynchronize` into the same kernel so that it can overlap with both the preceding and the subsequent kernels.
+```c
+__global__ void primary_kernel() {
+ // Initial work that should finish before starting secondary kernel
+
+ // Trigger the secondary kernel
+ cudaTriggerProgrammaticLaunchCompletion();
+
+ // Work that can coincide with the secondary kernel
+}
+
+__global__ void secondary_kernel()
+{
+ // Independent work
+
+ // Will block until all primary kernels the secondary kernel is dependent on have completed and flushed results to global memory
+ cudaGridDependencySynchronize();
+
+ // Dependent work
+}
+```
+
+We have verified the accuracy after the modification to ensure that computation results are not affected by incorrect memory reads and writes. With this premise, we made those kernels overlap as much as possible for performance considerations. In TensorRT LLM, PDL can be enabled by setting the environment variable `TRTLLM_ENABLE_PDL` to `1`, and we may introduce this as an official API in the future.
+
+The effect of enabling PDL can be clearly observed using [NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems). Taking `moeComputeRouteKernel`, `computeCountAndIndiceDevice` and `computeCumsumDevice` kernels as an example, they are executed in order when disabling PDL:
+
+Figure 4: The profiling results of disabling PDL.
+
+The following profiling results show how the three kernels overlap after enabling PDL.
+
+Figure 5: The profiling results of enabling PDL.
+
+*The above profiles were generated by using commit [84d2f12](https://github.com/NVIDIA/TensorRT-LLM/tree/84d2f1281857fbb1662b14603d3123cf327ac94f) on the main branch. They may change in future versions.*
+
+For tips on using Nsys to profile and analyze TensorRT LLM performance, refer to [Coordinating with NVIDIA Nsight Systems Launch](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/developer-guide/perf-analysis.md#coordinating-with-nvidia-nsight-systems-launch).
+
+### Fuse several `AlltoAll` kernels
+
+We redesigned and reimplemented `AlltoAll` to better support communication fusion (including `hiddenStates` during dispatch, the low-precision ScalingFactor, and MoE's `tokenSelectedExpert` and scales), to support low-precision communication during dispatch, and to handle potential non-alignment issues in the original data.
+
+Taking the dispatch of four fields as an example, the data flow is shown in Figure 6.
+
+Figure 6: The data flow of new Alltoall kernel
+
+The sending process is as follows:
+- The first step loads the original data according to the data alignment in global memory, using TMA to load into shared memory as `unAlignedData`.
+- Next, in shared memory, all fields are aligned to 16-byte boundaries and different fields are concatenated together to form `alignedData`.
+- If low-precision communication is needed, the aligned data is quantized into low-precision `lowPrecisionData`. Currently, quantization is only supported for a single field.
+- Next, corresponding encoding is performed according to the protocol. For example, with LL128, each 128 bytes contains 120 bytes of valid data and 8 bytes of flags. To avoid bank conflicts during encoding in shared memory, we select different flag positions for different packets, and the final encoded data is stored in `protoPackedData+Flag`.
+- Finally, the proto-encoded `protoPackedData+Flag` is written to the remote GPU's workspace.
+
+For the receiver, it only needs to check the flag at the corresponding position in the workspace to confirm whether the data is ready. If ready, the original data is decoded in the reverse manner of sending and written to the corresponding tensors.
+
+Through this approach, we can support sending and receiving multiple arbitrarily aligned fields in a fused manner and support low-precision communication during the combine process. This feature was implemented in [PR 6973](https://github.com/NVIDIA/TensorRT-LLM/pull/6973).
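+
+The 16-byte alignment and field concatenation step can be illustrated with a small offset calculation; the field sizes below are hypothetical and only meant to show how per-token payload offsets are derived, not the actual TensorRT LLM layout:
+
+```python
+# Compute 16-byte-aligned offsets for the fused per-token payload.
+ALIGN = 16
+
+def aligned(nbytes: int) -> int:
+    return (nbytes + ALIGN - 1) // ALIGN * ALIGN
+
+fields = {
+    "hiddenStates": 7168 * 2,      # e.g. BF16 hidden states per token (illustrative)
+    "scalingFactor": 448,          # per-block scale factors (illustrative)
+    "tokenSelectedExpert": 8 * 4,  # top-8 expert ids, int32 (illustrative)
+    "expertScales": 8 * 4,         # top-8 routing scales, fp32 (illustrative)
+}
+
+offsets, cursor = {}, 0
+for name, nbytes in fields.items():
+    offsets[name] = cursor         # where this field starts inside alignedData
+    cursor += aligned(nbytes)
+
+print(offsets, "fused payload bytes per token:", cursor)
+```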
+
+### Fuse `add` (sparse exp and shared exp) into local reduction
+
+To reduce the number of kernel launches and achieve better overlap at the tail of the MoE module, we've fused the shared-expert add into the local reduction kernel that aggregates top-k experts. This removes the extra add operator without increasing the reduce operator's overhead. It also achieves single write-out and lower bandwidth occupancy.
+
+The optimization is compatible with NVFP4 combine without requiring any API changes and brings no accuracy impact. It was added by [PR 7422](https://github.com/NVIDIA/TensorRT-LLM/pull/7422).
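+
+A minimal PyTorch sketch of the fused epilogue (weighted top-k reduction plus shared-expert add in one pass, written out once); shapes and names are illustrative rather than the actual kernel interface:
+
+```python
+import torch
+
+def combine_with_shared(expert_out, routing_weights, shared_out):
+    # expert_out:      [num_tokens, top_k, hidden]  payloads returned by the combine AlltoAll
+    # routing_weights: [num_tokens, top_k]
+    # shared_out:      [num_tokens, hidden]
+    reduced = torch.einsum("tkh,tk->th", expert_out, routing_weights)  # local top-k reduction
+    return reduced + shared_out  # the add that used to be a separate kernel
+
+tokens, top_k, hidden = 4, 8, 16
+out = combine_with_shared(torch.randn(tokens, top_k, hidden),
+                          torch.softmax(torch.randn(tokens, top_k), dim=-1),
+                          torch.randn(tokens, hidden))
+```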
+
+### Optimize PyTorch native `copy` and `concat` using `torch.compile`
+
+We have observed several inefficient `copy` and `concat` operations in the context phase in wide EP scenarios, and one significant case is copying `k_nope` in the MLA module. As mentioned in the previous section, Q and K are divided into two parts in DeepSeek MLA: with RoPE and without RoPE. In the context phase, the nope head size is 128 and the rope head size is 64, which adds up to a head size of 192. However, the FMHA kernel directly reads Q and K with head size 192, which means that we have to prepare the full Q and K using `copy` and `concat`.
+
+For an ISL/OSL 8k/1k, batch size 1 case, in the context phase, we observed that the `copy` operation takes 306us, which is clearly suboptimal. If we calculate a theoretical duration, assuming 8 TB/sec HBM3e bandwidth, the formula is roughly:
+```
+( ISL 8192 * k_nope_size 128 * num_heads 128 * 2 bytes * read/write 2 ) / ( 8 TB/sec * efficiency 0.8 ) = 80 us
+```
+
+To optimize the operator, we simply added a `torch.compile` decorator to the operation, and the kernel duration drops to 107us, a large reduction that is already close to the theoretical estimate. [PR 8044](https://github.com/NVIDIA/TensorRT-LLM/pull/8044) implemented the changes. This is a good example of the power of `torch.compile`, showing how to analyze and optimize without heavily hand-crafting kernels.
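+
+A hedged sketch of the idea, assuming a CUDA device: wrap the copy/concat that builds the full 192-head-size K in `torch.compile` so that Inductor fuses it into a single kernel. The function below is illustrative and not the actual MLA code path:
+
+```python
+import torch
+
+@torch.compile  # let Inductor fuse the expand + concat into one kernel
+def build_full_k(k_nope: torch.Tensor, k_rope: torch.Tensor) -> torch.Tensor:
+    # k_nope: [num_tokens, num_heads, 128]; k_rope: [num_tokens, 1, 64] shared across heads
+    k_rope = k_rope.expand(-1, k_nope.shape[1], -1)
+    return torch.cat([k_nope, k_rope], dim=-1)  # [num_tokens, num_heads, 192]
+
+k = build_full_k(torch.randn(8192, 128, 128, device="cuda", dtype=torch.bfloat16),
+                 torch.randn(8192, 1, 64, device="cuda", dtype=torch.bfloat16))
+```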
+
+## End-to-End Performance
+
+After applying the optimizations above, the network structure is cleaner. For example, `o_proj` and `A2A tokens` now compute in lower precision, and operators such as the sparse-expert and shared-expert `add` are now fused into the `reduction`. The optimized parts are marked in **bold**.
+
+Figure 7: Network structure overview after optimization
+
+We measured one round of performance and compared it with the baseline (main branch in July). With the optimizations mentioned above, we can see a significant performance improvement.
+
+Figure 8: End-to-End Performance on Aug 31st
+
+*Note: The numbers were collected on August 31st. Some optimizations mentioned above were not yet added at that time.*
+
+To review how wide EP helps with Blackwell's leading inference benchmarks, also read these recent blog posts:
+* [NVIDIA Blackwell Leads on SemiAnalysis InferenceMAX™ v1 Benchmarks](https://developer.nvidia.com/blog/nvidia-blackwell-leads-on-new-semianalysis-inferencemax-benchmarks/)
+* [NVIDIA Blackwell Raises Bar in New InferenceMAX Benchmarks, Delivering Unmatched Performance and Efficiency](https://blogs.nvidia.com/blog/blackwell-inferencemax-benchmark-results/)
+
+## Acknowledgements
+This is a great continuation of previous work on TensorRT-LLM wide EP and another demonstration of excellent teamwork. It stems from brilliant performance optimization ideas, solid performance analysis and benchmarking, and rapid engineering support and implementation. By sharing these experiences, we hope to help more people who are interested in deploying large-scale LLM models on NVIDIA GPUs to run AI faster.
diff --git a/latest/_sources/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md.txt b/latest/_sources/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md.txt
index 0cc2296a9e..d76b53763f 100644
--- a/latest/_sources/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md.txt
+++ b/latest/_sources/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md.txt
@@ -25,7 +25,7 @@ TensorRT LLM distributes the pre-built container on [NGC Catalog](https://catalo
You can launch the container using the following command:
```bash
-docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc0
+docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc1
```
@@ -151,16 +151,44 @@ P99 E2EL (ms): 1643.44
### Key Metrics
-* Median Time to First Token (TTFT)
+#### Time to First Token (TTFT)
* The typical time elapsed from when a request is sent until the first output token is generated.
-* Median Time Per Output Token (TPOT)
- * The typical time required to generate each token *after* the first one.
-* Median Inter-Token Latency (ITL)
- * The typical time delay between the completion of one token and the completion of the next.
-* Median End-to-End Latency (E2EL)
+
+#### Time Per Output Token (TPOT) and Inter-Token Latency (ITL)
+ * TPOT is the typical time required to generate each token *after* the first one.
+ * ITL is the typical time delay between the completion of one token and the completion of the next.
+ * Both TPOT and ITL ignore TTFT.
+
+For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
+
+```math
+\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
+```
+
+Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
+
+```math
+\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
+```
+
+```math
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
+```
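+
+A small worked example with hypothetical numbers, showing that the request-weighted average TPOT and the token-weighted average ITL can differ when requests have different output lengths:
+
+```python
+# Hypothetical per-request inter-token latencies in ms.
+itls = {
+    "short_req": [10.0, 10.0],   # 3 output tokens -> 2 intervals
+    "long_req": [5.0] * 20,      # 21 output tokens -> 20 intervals
+}
+
+tpot_per_req = {r: sum(v) / len(v) for r, v in itls.items()}
+avg_tpot = sum(tpot_per_req.values()) / len(tpot_per_req)  # request-weighted: 7.5 ms
+avg_itl = (sum(sum(v) for v in itls.values())
+           / sum(len(v) for v in itls.values()))           # token-weighted: ~5.45 ms
+
+print(tpot_per_req, avg_tpot, avg_itl)
+```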
+
+#### End-to-End (E2E) Latency
* The typical total time from when a request is submitted until the final token of the response is received.
-* Total Token Throughput
+
+#### Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
+```math
+\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
+
+#### Tokens Per Second (TPS) or Output Token Throughput
+ * The number of output tokens the system generates each second.
+```math
+\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
## About `extra_llm_api_options`
trtllm-serve provides `extra_llm_api_options` knob to **overwrite** the parameters specified by trtllm-serve.
@@ -267,28 +295,28 @@ python -m tensorrt_llm.serve.scripts.benchmark_serving \
Below is some example TensorRT-LLM serving benchmark output. Your actual results may vary.
```
============ Serving Benchmark Result ============
-Successful requests: 1
-Benchmark duration (s): 0.83
-Total input tokens: 128
-Total generated tokens: 128
-Request throughput (req/s): 1.20
-Output token throughput (tok/s): 153.92
-Total Token throughput (tok/s): 307.85
-User throughput (tok/s): 154.15
-Mean Request AR: 0.9845
-Median Request AR: 0.9845
+Successful requests: 1
+Benchmark duration (s): 0.83
+Total input tokens: 128
+Total generated tokens: 128
+Request throughput (req/s): 1.20
+Output token throughput (tok/s): 153.92
+Total Token throughput (tok/s): 307.85
+User throughput (tok/s): 154.15
+Mean Request AR: 0.9845
+Median Request AR: 0.9845
---------------Time to First Token----------------
-Mean TTFT (ms): 84.03
-Median TTFT (ms): 84.03
-P99 TTFT (ms): 84.03
+Mean TTFT (ms): 84.03
+Median TTFT (ms): 84.03
+P99 TTFT (ms): 84.03
-----Time per Output Token (excl. 1st token)------
-Mean TPOT (ms): 5.88
-Median TPOT (ms): 5.88
-P99 TPOT (ms): 5.88
+Mean TPOT (ms): 5.88
+Median TPOT (ms): 5.88
+P99 TPOT (ms): 5.88
---------------Inter-token Latency----------------
-Mean ITL (ms): 5.83
-Median ITL (ms): 5.88
-P99 ITL (ms): 6.14
+Mean ITL (ms): 5.83
+Median ITL (ms): 5.88
+P99 ITL (ms): 6.14
==================================================
```
diff --git a/latest/_sources/deployment-guide/index.rst.txt b/latest/_sources/deployment-guide/index.rst.txt
index b258b68c1e..2327de5000 100644
--- a/latest/_sources/deployment-guide/index.rst.txt
+++ b/latest/_sources/deployment-guide/index.rst.txt
@@ -10,3 +10,4 @@ Model Recipes
quick-start-recipe-for-llama3.3-70b-on-trtllm.md
quick-start-recipe-for-llama4-scout-on-trtllm.md
quick-start-recipe-for-gpt-oss-on-trtllm.md
+ quick-start-recipe-for-qwen3-next-on-trtllm.md
diff --git a/latest/_sources/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md.txt b/latest/_sources/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md.txt
index 8d811c6547..87394c8cdd 100644
--- a/latest/_sources/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md.txt
@@ -22,7 +22,7 @@ The guide is intended for developers and practitioners seeking high-throughput o
## MoE Backend Support Matrix
-There are multiple MOE backends inside TRT-LLM, not all of them supporting every precision on every GPUs. Here are the support matrix of the MOE backends.
+There are multiple MoE backends inside TensorRT LLM, and not all of them support every precision on every GPU. Here is the support matrix of the MoE backends.
| device | Checkpoint | Supported moe_backend |
|----------|----------|----------|
@@ -58,9 +58,9 @@ Note:
* The command also maps port `8000` from the container to your host so you can access the LLM API endpoint from your host
* See the for all the available containers. The containers published in the main branch weekly have `rcN` suffix, while the monthly release with QA tests has no `rcN` suffix. Use the `rc` release to get the latest model and feature support.
-If you want to use latest main branch, you can choose to build from source to install TensorRT LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html)
+If you want to use the latest main branch, you can build TensorRT LLM from source; for the steps, refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html)
-### Creating the TRT-LLM Server config
+### Creating the TensorRT LLM Server config
We create a YAML configuration file /tmp/config.yml for the TensorRT LLM Server and populate it with the following recommended performance settings.
@@ -103,15 +103,14 @@ moe_config:
EOF
```
-### Launch the TRT-LLM Server
+### Launch the TensorRT LLM Server
-Below is an example command to launch the TRT-LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
+Below is an example command to launch the TensorRT LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
```shell
trtllm-serve deepseek-ai/DeepSeek-R1-0528 \
--host 0.0.0.0 \
--port 8000 \
- --backend pytorch \
--max_batch_size 1024 \
--max_num_tokens 3200 \
--max_seq_len 2048 \
@@ -141,9 +140,6 @@ These options are used directly on the command line when you start the `trtllm-s
* **Description:** A value between `0.0` and `1.0` that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
* **Recommendation:** If you experience OOM errors, try reducing this value to `0.7` or lower.
-#### `--backend pytorch`
-
- **Description:** Tells TensorRT LLM to use the **pytorch** backend.
#### `--max_batch_size`
@@ -230,7 +226,7 @@ Refer to the wide EP [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main
### Basic Test
-Start a new terminal on the host to test the TensorRT LLM server you just launched.
+Start a new terminal on the host to test the TensorRT LLM server you just launched.
You can query the health/readiness of the server using:
@@ -240,7 +236,7 @@ curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
-After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server.
+After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
```shell
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
@@ -251,7 +247,7 @@ curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -
}'
```
-Here is an example response, showing that the TRT-LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
+Here is an example response, showing that the TensorRT LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
```json
{"id":"cmpl-e728f08114c042309efeae4df86a50ca","object":"text_completion","created":1754294810,"model":"deepseek-ai/DeepSeek-R1-0528","choices":[{"index":0,"text":" / by Megan Stine ; illustrated by John Hinderliter.\n\nBook | Gross","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null}],"usage":{"prompt_tokens":6,"total_tokens":22,"completion_tokens":16},"prompt_token_ids":null}
@@ -318,7 +314,7 @@ Sample result in Blackwell:
## Benchmarking Performance
-To benchmark the performance of your TensorRT LLM server you can leverage the built-in “benchmark\_serving.py” script. To do this first creating a wrapper [bench.sh](http://bench.sh) script.
+To benchmark the performance of your TensorRT LLM server, you can leverage the built-in `benchmark_serving.py` script. To do this, first create a wrapper `bench.sh` script.
```shell
cat < bench.sh
@@ -358,7 +354,7 @@ If you want to save the results to a file add the following options.
--result-filename "concurrency_${concurrency}.json"
```
-For more benchmarking options see .
+For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py)
Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script.
@@ -399,13 +395,41 @@ P99 E2EL (ms): [result]
### Key Metrics
-* Median Time to First Token (TTFT)
+#### Time to First Token (TTFT)
* The typical time elapsed from when a request is sent until the first output token is generated.
-* Median Time Per Output Token (TPOT)
- * The typical time required to generate each token *after* the first one.
-* Median Inter-Token Latency (ITL)
- * The typical time delay between the completion of one token and the completion of the next.
-* Median End-to-End Latency (E2EL)
+
+#### Time Per Output Token (TPOT) and Inter-Token Latency (ITL)
+ * TPOT is the typical time required to generate each token *after* the first one.
+ * ITL is the typical time delay between the completion of one token and the completion of the next.
+ * Both TPOT and ITL ignore TTFT.
+
+For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
+
+```math
+\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
+```
+
+Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
+
+```math
+\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
+```
+
+```math
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
+```
+
+#### End-to-End (E2E) Latency
* The typical total time from when a request is submitted until the final token of the response is received.
-* Total Token Throughput
+
+#### Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
+```math
+\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
+
+#### Tokens Per Second (TPS) or Output Token Throughput
+ * The number of output tokens the system generates each second.
+```math
+\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
diff --git a/latest/_sources/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md.txt b/latest/_sources/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md.txt
index 243b2609e8..16da732a1d 100644
--- a/latest/_sources/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md.txt
@@ -21,7 +21,7 @@ The guide is intended for developers and practitioners seeking high-throughput o
## MoE Backend Support Matrix
-There are multiple MOE backends inside TRT-LLM. Here are the support matrix of the MOE backends.
+There are multiple MoE backends inside TensorRT LLM. Here is the support matrix of the MoE backends.
| Device | Activation Type | MoE Weights Type | MoE Backend | Use Case |
|------------|------------------|------------------|-------------|----------------|
@@ -56,7 +56,7 @@ Note:
If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to .
-### Creating the TRT-LLM Server config
+### Creating the TensorRT LLM Server config
We create a YAML configuration file `/tmp/config.yml` for the TensorRT-LLM Server and populate it with the following recommended performance settings.
@@ -98,15 +98,14 @@ attention_dp_config:
EOF
```
-### Launch the TRT-LLM Server
+### Launch the TensorRT LLM Server
-Below is an example command to launch the TRT-LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
+Below is an example command to launch the TensorRT LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
```shell
trtllm-serve openai/gpt-oss-120b \
--host 0.0.0.0 \
--port 8000 \
- --backend pytorch \
--max_batch_size 720 \
--max_num_tokens 16384 \
--kv_cache_free_gpu_memory_fraction 0.9 \
@@ -135,10 +134,6 @@ These options are used directly on the command line when you start the `trtllm-s
* **Description:** A value between `0.0` and `1.0` that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
* **Recommendation:** If you experience OOM errors, try reducing this value to `0.7` or lower.
-#### `--backend pytorch`
-
-* **Description:** Tells TensorRT-LLM to use the **pytorch** backend.
-
#### `--max_batch_size`
* **Description:** The maximum number of user requests that can be grouped into a single batch for processing. The actual max batch size that can be achieved depends on total sequence length (input + output).
@@ -201,7 +196,7 @@ curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
-After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server.
+After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
```shell
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
@@ -217,7 +212,7 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/jso
}' -w "\n"
```
-Here is an example response, showing that the TRT-LLM server reasons and answers the questions.
+Here is an example response, showing that the TensorRT LLM server reasons and answers the questions.
TODO: Use Chat Completions API / Responses API as the example after the PR is merged.
@@ -238,7 +233,7 @@ TODO: Use Chat Compeletions API / Responses API as the example after the PR is m
We use OpenAI's official evaluation tool to test the model's accuracy. For more information, see [gpt-oss-eval](https://github.com/openai/gpt-oss/tree/main/gpt_oss/evals).
With the added support of Chat Completions and Responses API in `trtllm-serve`, `gpt_oss.evals` works directly without any modifications.
-You need to set `enable_attention_dp`, `tp_size`, `ep_size`, `max_batch_size` and `max_num_tokens` when launching the trtllm server and set `reasoning-effort` when launching evaluation in gpt-oss. Below are some reference configurations for accuracy evaluation on B200.
+You need to set `enable_attention_dp`, `tp_size`, `ep_size`, `max_batch_size` and `max_num_tokens` when launching the trtllm server and set `reasoning-effort` when launching evaluation in gpt-oss. Below are some reference configurations for accuracy evaluation on B200.
| **reasoning-effort** | **parallel configuration** | **max_batch_size** | **max_num_tokens** |
|:--------------------:|:--------------------------:|:------------------:|:------------------:|
@@ -305,7 +300,7 @@ If you want to save the results to a file add the following options.
--result-filename "concurrency_${concurrency}.json"
```
-For more benchmarking options see .
+For more benchmarking options, see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py).
Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script.
@@ -346,13 +341,41 @@ P99 E2EL (ms): [result]
### Key Metrics
-* Median Time to First Token (TTFT)
+#### Time to First Token (TTFT)
* The typical time elapsed from when a request is sent until the first output token is generated.
-* Median Time Per Output Token (TPOT)
- * The typical time required to generate each token *after* the first one.
-* Median Inter-Token Latency (ITL)
- * The typical time delay between the completion of one token and the completion of the next.
-* Median End-to-End Latency (E2EL)
+
+#### Time Per Output Token (TPOT) and Inter-Token Latency (ITL)
+ * TPOT is the typical time required to generate each token *after* the first one.
+ * ITL is the typical time delay between the completion of one token and the completion of the next.
+ * Both TPOT and ITL ignore TTFT.
+
+For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
+
+```math
+\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
+```
+
+Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
+
+```math
+\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
+```
+
+```math
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
+```
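+
+As an illustration of the weighting difference, here is a minimal Python sketch (the variable names and sample numbers are made up for this example, not taken from the benchmark script):
+
+```python
+# Per-request ITLs in milliseconds: request 1 produced 4 output tokens
+# (3 intervals), request 2 produced 2 output tokens (1 interval).
+itls_per_request = [
+    [10.0, 12.0, 11.0],
+    [30.0],
+]
+
+# Average TPOT: average each request's intervals first, then average across requests.
+tpots = [sum(itls) / len(itls) for itls in itls_per_request]
+avg_tpot = sum(tpots) / len(tpots)       # (11.0 + 30.0) / 2 = 20.5 ms
+
+# Average ITL: pool all intervals, so requests with more output tokens weigh more.
+all_itls = [itl for itls in itls_per_request for itl in itls]
+avg_itl = sum(all_itls) / len(all_itls)  # 63.0 / 4 = 15.75 ms
+```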
+
+#### End-to-End (E2E) Latency
* The typical total time from when a request is submitted until the final token of the response is received.
-* Total Token Throughput
+
+#### Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
+```math
+\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
+
+#### Tokens Per Second (TPS) or Output Token Throughput
+ * The number of output tokens the system generates each second.
+```math
+\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
diff --git a/latest/_sources/deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md.txt b/latest/_sources/deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md.txt
index ed7dd58d41..011920a9f0 100644
--- a/latest/_sources/deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md.txt
@@ -12,15 +12,15 @@ To use Llama 3.3-70B, you must first agree to Meta’s Llama 3 Community License
## Prerequisites
-GPU: NVIDIA Blackwell or Hopper Architecture
-OS: Linux
-Drivers: CUDA Driver 575 or Later
-Docker with NVIDIA Container Toolkit installed
+GPU: NVIDIA Blackwell or Hopper Architecture
+OS: Linux
+Drivers: CUDA Driver 575 or Later
+Docker with NVIDIA Container Toolkit installed
Python3 and python3-pip (Optional, for accuracy evaluation only)
## Models
-* FP8 model: [Llama-3.3-70B-Instruct-FP8](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8)
+* FP8 model: [Llama-3.3-70B-Instruct-FP8](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8)
* NVFP4 model: [Llama-3.3-70B-Instruct-FP4](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP4)
@@ -43,16 +43,16 @@ nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc6 \
/bin/bash
```
-Note:
+Note:
-* You can mount additional directories and paths using the \-v \:\ flag if needed, such as mounting the downloaded weight paths.
-* The command mounts your user .cache directory to save the downloaded model checkpoints which are saved to \~/.cache/huggingface/hub/ by default. This prevents having to redownload the weights each time you rerun the container. If the \~/.cache directory doesn’t exist please create it using mkdir \~/.cache
-* The command also maps port **8000** from the container to your host so you can access the LLM API endpoint from your host
+* You can mount additional directories and paths using the `-v <host_path>:<container_path>` flag if needed, such as mounting the downloaded weight paths.
+* The command mounts your user `.cache` directory to save the downloaded model checkpoints, which are saved to `~/.cache/huggingface/hub/` by default. This prevents having to redownload the weights each time you rerun the container. If the `~/.cache` directory doesn’t exist, please create it using `mkdir ~/.cache`.
+* The command also maps port **8000** from the container to your host so you can access the LLM API endpoint from your host.
* See the [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) for all the available containers. The containers published in the main branch weekly have “rcN” suffix, while the monthly release with QA tests has no “rcN” suffix. Use the rc release to get the latest model and feature support.
-If you want to use latest main branch, you can choose to build from source to install TensorRT LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html)
+If you want to use the latest main branch, you can build TensorRT LLM from source by following the steps in [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html).
-### Creating the TRT-LLM Server config
+### Creating the TensorRT LLM Server config
We create a YAML configuration file /tmp/config.yml for the TensorRT LLM Server and populate it with the following recommended performance settings.
@@ -64,20 +64,19 @@ enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1024
-kv_cache_config:
+kv_cache_config:
dtype: fp8
EOF
```
-### Launch the TRT-LLM Server
+### Launch the TensorRT LLM Server
-Below is an example command to launch the TRT-LLM server with the Llama-3.3-70B-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
+Below is an example command to launch the TensorRT LLM server with the Llama-3.3-70B-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
```shell
trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 \
--host 0.0.0.0 \
--port 8000 \
- --backend pytorch \
--max_batch_size 1024 \
--max_num_tokens 2048 \
--max_seq_len 2048 \
@@ -107,10 +106,6 @@ These options are used directly on the command line when you start the `trtllm-s
**Recommendation:** If you experience OOM errors, try reducing this value to **0.8** or lower.
-#### `--backend pytorch`
-
- **Description:** Tells TensorRT LLM to use the **pytorch** backend.
-
#### `--max_batch_size`
**Description:** The maximum number of user requests that can be grouped into a single batch for processing.
@@ -136,7 +131,7 @@ These options provide finer control over performance and are set within a YAML f
**Description**: A section for configuring the Key-Value (KV) cache.
- **Options**:
+ **Options**:
dtype: Sets the data type for the KV cache.
@@ -184,7 +179,7 @@ See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.h
### Basic Test
-Start a new terminal on the host to test the TensorRT LLM server you just launched.
+Start a new terminal on the host to test the TensorRT LLM server you just launched.
You can query the health/readiness of the server using:
@@ -194,7 +189,7 @@ curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
-After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server.
+After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
```shell
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
@@ -205,7 +200,7 @@ curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -
}'
```
-Here is an example response, showing that the TRT-LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
+Here is an example response, showing that the TensorRT LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
```json
{"id":"cmpl-bc1393d529ce485c961d9ffee5b25d72","object":"text_completion","created":1753843963,"model":"nvidia/Llama-3.3-70B-Instruct-FP8","choices":[{"index":0,"text":" New York is a state located in the northeastern United States. It is bordered by","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null}],"usage":{"prompt_tokens":6,"total_tokens":22,"completion_tokens":16},"prompt_token_ids":null}
@@ -213,10 +208,10 @@ Here is an example response, showing that the TRT-LLM server returns “New York
### Troubleshooting Tips
-* If you encounter CUDA out-of-memory errors, try reducing max\_batch\_size or max\_seq\_len
-* Ensure your model checkpoints are compatible with the expected format
-* For performance issues, check GPU utilization with nvidia-smi while the server is running
-* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed
+* If you encounter CUDA out-of-memory errors, try reducing max\_batch\_size or max\_seq\_len
+* Ensure your model checkpoints are compatible with the expected format
+* For performance issues, check GPU utilization with nvidia-smi while the server is running
+* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed
* For connection issues, make sure port 8000 is not being used by another application
### Running Evaluations to Verify Accuracy (Optional)
@@ -241,7 +236,7 @@ MODEL_PATH=nvidia/Llama-3.3-70B-Instruct-FP8
lm_eval --model local-completions --tasks gsm8k --batch_size 256 --gen_kwargs temperature=0.0,add_special_tokens=False --num_fewshot 5 --model_args model=${MODEL_PATH},base_url=http://localhost:8000/v1/completions,num_concurrent=32,max_retries=20,tokenized_requests=False --log_samples --output_path trtllm.fp8.gsm8k
```
-Sample result in Blackwell.
+Sample result in Blackwell.
```
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
@@ -271,7 +266,7 @@ Sample result in Blackwell
## Benchmarking Performance
-To benchmark the performance of your TensorRT LLM server you can leverage the built-in “benchmark\_serving.py” script. To do this first creating a wrapper [bench.sh](http://bench.sh) script.
+To benchmark the performance of your TensorRT LLM server, you can leverage the built-in `benchmark_serving.py` script. To do this, first create a wrapper `bench.sh` script.
```shell
cat < bench.sh
@@ -311,7 +306,7 @@ If you want to save the results to a file add the following options.
--result-filename "concurrency_${concurrency}.json"
```
-For more benchmarking options see. [https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt\_llm/serve/scripts/benchmark\_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py)
+For more benchmarking options, see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py).
Run bench.sh to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above bench.sh script.
@@ -352,13 +347,41 @@ P99 E2EL (ms): [result]
### Key Metrics
-* Median Time to First Token (TTFT)
- * The typical time elapsed from when a request is sent until the first output token is generated.
-* Median Time Per Output Token (TPOT)
- * The typical time required to generate each token *after* the first one.
-* Median Inter-Token Latency (ITL)
- * The typical time delay between the completion of one token and the completion of the next.
-* Median End-to-End Latency (E2EL)
- * The typical total time from when a request is submitted until the final token of the response is received.
-* Total Token Throughput
- * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
+#### Time to First Token (TTFT)
+ * The typical time elapsed from when a request is sent until the first output token is generated.
+
+#### Time Per Output Token (TPOT) and Inter-Token Latency (ITL)
+ * TPOT is the typical time required to generate each token *after* the first one.
+ * ITL is the typical time delay between the completion of one token and the completion of the next.
+ * Both TPOT and ITL ignore TTFT.
+
+For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
+
+```math
+\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
+```
+
+Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
+
+```math
+\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
+```
+
+```math
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
+```
+
+#### End-to-End (E2E) Latency
+ * The typical total time from when a request is submitted until the final token of the response is received.
+
+#### Total Token Throughput
+ * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
+```math
+\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
+
+#### Tokens Per Second (TPS) or Output Token Throughput
+ * The number of output tokens the system generates each second.
+```math
+\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
diff --git a/latest/_sources/deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md.txt b/latest/_sources/deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md.txt
index b4246f024a..0ea925e471 100644
--- a/latest/_sources/deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md.txt
+++ b/latest/_sources/deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md.txt
@@ -51,7 +51,7 @@ Note:
If you want to use the latest main branch, you can build TensorRT LLM from source by following the steps in [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html).
-### Creating the TRT-LLM Server config
+### Creating the TensorRT LLM Server config
We create a YAML configuration file /tmp/config.yml for the TensorRT LLM Server and populate it with the following recommended performance settings.
@@ -68,15 +68,14 @@ kv_cache_config:
EOF
```
-### Launch the TRT-LLM Server
+### Launch the TensorRT LLM Server
-Below is an example command to launch the TRT-LLM server with the Llama-4-Scout-17B-16E-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
+Below is an example command to launch the TensorRT LLM server with the Llama-4-Scout-17B-16E-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
```shell
trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 \
--host 0.0.0.0 \
--port 8000 \
- --backend pytorch \
--max_batch_size 1024 \
--max_num_tokens 2048 \
--max_seq_len 2048 \
@@ -106,10 +105,6 @@ These options are used directly on the command line when you start the `trtllm-s
* **Description:** A value between `0.0` and `1.0` that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
* **Recommendation:** If you experience OOM errors, try reducing this value to `0.7` or lower.
-#### `--backend pytorch`
-
- **Description:** Tells TensorRT LLM to use the **pytorch** backend.
-
#### `--max_batch_size`
* **Description:** The maximum number of user requests that can be grouped into a single batch for processing.
@@ -191,7 +186,7 @@ curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
-After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server.
+After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
```shell
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
@@ -202,7 +197,7 @@ curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -
}'
```
-Here is an example response, showing that the TRT-LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
+Here is an example response, showing that the TensorRT LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
```json
{"id":"cmpl-bc1393d529ce485c961d9ffee5b25d72","object":"text_completion","created":1753843963,"model":"$MODEL","choices":[{"index":0,"text":" New York is a state located in the northeastern United States. It is bordered by","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null}],"usage":{"prompt_tokens":6,"total_tokens":22,"completion_tokens":16},"prompt_token_ids":null}
@@ -304,7 +299,7 @@ If you want to save the results to a file add the following options.
--result-filename "concurrency_${concurrency}.json"
```
-For more benchmarking options see .
+For more benchmarking options, see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py).
Run bench.sh to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above bench.sh script.
@@ -345,13 +340,41 @@ P99 E2EL (ms): [result]
### Key Metrics
-* Median Time to First Token (TTFT)
+#### Time to First Token (TTFT)
* The typical time elapsed from when a request is sent until the first output token is generated.
-* Median Time Per Output Token (TPOT)
- * The typical time required to generate each token *after* the first one.
-* Median Inter-Token Latency (ITL)
- * The typical time delay between the completion of one token and the completion of the next.
-* Median End-to-End Latency (E2EL)
+
+#### Time Per Output Token (TPOT) and Inter-Token Latency (ITL)
+ * TPOT is the typical time required to generate each token *after* the first one.
+ * ITL is the typical time delay between the completion of one token and the completion of the next.
+ * Both TPOT and ITL ignore TTFT.
+
+For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
+
+```math
+\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
+```
+
+Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
+
+```math
+\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
+```
+
+```math
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
+```
+
+#### End-to-End (E2E) Latency
* The typical total time from when a request is submitted until the final token of the response is received.
-* Total Token Throughput
+
+#### Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
+```math
+\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
+
+#### Tokens Per Second (TPS) or Output Token Throughput
+ * The number of output tokens the system generates each second.
+```math
+\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
+```
diff --git a/latest/_sources/deployment-guide/quick-start-recipe-for-qwen3-next-on-trtllm.md.txt b/latest/_sources/deployment-guide/quick-start-recipe-for-qwen3-next-on-trtllm.md.txt
new file mode 100644
index 0000000000..ce192b9f5f
--- /dev/null
+++ b/latest/_sources/deployment-guide/quick-start-recipe-for-qwen3-next-on-trtllm.md.txt
@@ -0,0 +1,237 @@
+# Quick Start Recipe for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware
+
+## Introduction
+
+This is a functional quick-start guide for running the Qwen3-Next model on TensorRT LLM. It focuses on a working setup with recommended defaults. Additional performance optimizations and support will be rolled out in future updates.
+
+## Prerequisites
+
+* GPU: NVIDIA Blackwell or Hopper Architecture
+* OS: Linux
+* Drivers: CUDA Driver 575 or Later
+* Docker with NVIDIA Container Toolkit installed
+* Python3 and python3-pip (Optional, for accuracy evaluation only)
+
+## Models
+
+* BF16 model: [Qwen3-Next-80B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking)
+
+## Deployment Steps
+
+### Run Docker Container
+
+Build and run the docker container. See the [Docker guide](../../../docker/README.md) for details.
+```shell
+cd TensorRT-LLM
+
+make -C docker release_build IMAGE_TAG=qwen3-next-local
+
+make -C docker release_run IMAGE_NAME=tensorrt_llm IMAGE_TAG=qwen3-next-local LOCAL_USER=1
+```
+
+### Creating the TensorRT LLM Server config
+
+We create a YAML configuration file `/tmp/config.yml` for the TensorRT LLM Server with the following content:
+
+```shell
+EXTRA_LLM_API_FILE=/tmp/config.yml
+
+cat << EOF > ${EXTRA_LLM_API_FILE}
+enable_attention_dp: false
+cuda_graph_config:
+ enable_padding: true
+ max_batch_size: 720
+moe_config:
+ backend: TRTLLM
+stream_interval: 20
+num_postprocess_workers: 4
+kv_cache_config:
+ enable_block_reuse: false
+ free_gpu_memory_fraction: 0.6
+EOF
+```
+
+
+### Launch the TensorRT LLM Server
+
+Below is an example command to launch the TensorRT LLM server with the Qwen3-Next model from within the container.
+
+```shell
+trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking \
+ --host 0.0.0.0 \
+ --port 8000 \
+ --max_batch_size 16 \
+ --max_num_tokens 4096 \
+ --tp_size 4 \
+ --pp_size 1 \
+ --ep_size 4 \
+ --trust_remote_code \
+ --extra_llm_api_options ${EXTRA_LLM_API_FILE}
+```
+
+After the server is set up, the client can now send prompt requests to the server and receive results.
+
+### Configs and Parameters
+
+These options are used directly on the command line when you start the `trtllm-serve` process.
+
+#### `--tp_size`
+
+* **Description:** Sets the **tensor-parallel size**. This should typically match the number of GPUs you intend to use for a single model instance.
+
+#### `--ep_size`
+
+* **Description:** Sets the **expert-parallel size** for Mixture-of-Experts (MoE) models. Like `tp_size`, this should generally match the number of GPUs you're using. This setting has no effect on non-MoE models.
+
+#### `--kv_cache_free_gpu_memory_fraction`
+
+* **Description:** A value between `0.0` and `1.0` that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
+* **Recommendation:** If you experience OOM errors, try reducing this value to `0.7` or lower.
+
+
+#### `--max_batch_size`
+
+* **Description:** The maximum number of user requests that can be grouped into a single batch for processing. The actual max batch size that can be achieved depends on total sequence length (input + output).
+
+#### `--max_num_tokens`
+
+* **Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch.
+
+#### `--max_seq_len`
+
+* **Description:** The maximum possible sequence length for a single request, including both input and generated output tokens. We do not set it explicitly; it is inferred from the model config.
+
+#### `--trust_remote_code`
+
+* **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
+
+
+#### Extra LLM API Options (YAML Configuration)
+
+These options provide finer control over performance and are set within a YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument.
+
+#### `cuda_graph_config`
+
+* **Description**: A section for configuring CUDA graphs to optimize performance.
+
+* **Options**:
+
+ * `enable_padding`: If `"true"`, input batches are padded to the nearest `cuda_graph_batch_size`. This can significantly improve performance.
+
+ **Default**: `false`
+
+ * `max_batch_size`: Sets the maximum batch size for which a CUDA graph will be created.
+
+ **Default**: `0`
+
+ **Recommendation**: Set this to the same value as the `--max_batch_size` command-line option.
+
+#### `moe_config`
+
+* **Description**: Configuration for Mixture-of-Experts (MoE) models.
+
+* **Options**:
+
+ * `backend`: The backend to use for MoE operations.
+ **Default**: `CUTLASS`
+
+See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`.
+
+## Testing API Endpoint
+
+### Basic Test
+
+Start a new terminal on the host to test the TensorRT LLM server you just launched.
+
+You can query the health/readiness of the server using:
+
+```shell
+curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
+```
+
+When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
+
+After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
+
+```shell
+curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+ "model": "Qwen/Qwen3-Next-80B-A3B-Thinking",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Where is New York?"
+ }
+ ],
+ "max_tokens": 1024,
+ "top_p": 1.0
+}' -w "\n"
+```
+
+Here is an example response:
+
+```
+{"id":"chatcmpl-64ac201c77bf46a7a3a4eca7759b1fd8","object":"chat.completion","created":1759022940,"model":"Qwen/Qwen3-Next-80B-A3B-Thinking","choices":[{"index":0,"message":{"role":"assistant","content":"Okay, the user is asking \"Where is New York?\" Hmm, this seems straightforward but I need to be careful. New York could mean different things—maybe they're confused about the city versus the state. \n\nFirst thought: Are they a tourist planning a trip? Or maybe a student doing homework? Could even be someone国外 who's only heard \"New York\" in movies and isn't sure if it's a city or state. \n\nI should clarify both possibilities immediately. People often mix them up. Like, if someone says \"I'm going to New York\" they're probably talking about NYC, but technically New York State is bigger. \n\nLet me break it down: \n- New York City (NYC) is the famous one—Manhattan, skyscrapers, Times Square. \n- Then New York State (NY) is the whole state, which includes NYC but also upstate areas like Albany (the capital), Buffalo, and even the Adirondacks. \n\nWait, should I mention that NYC is in New York State? Yeah, that's crucial. Otherwise they might think it's two separate things. Also, where is the state located? Northeast US, borders other states like Pennsylvania, New Jersey... maybe name a few neighbors for context. \n\nOh! And the city vs state confusion is super common. Like, people say \"New York\" for the city but forget it's part of a larger state. Should I give examples? \n- \"If someone says 'hot dogs in New York' they mean NYC\" \n- But \"the New York State Police\" covers the whole state\" \n\n...Maybe add a fun fact? Like how NYC is just one city within the state? Or that the state capital isn't NYC but Albany? Yeah, that'll help clear confusion. \n\nAlso—should I ask if they meant the city or the state? Nah, better to cover both upfront. Keep it simple but precise. No jargon. They might not know terms like \"metropolitan area\" or \"boroughs,\" so explain \"boroughs\" as neighborhoods (Manhattan, Brooklyn etc.). \n\nCoordinates: 40.7°N, 74°W? Maybe overkill unless they're into geography. But for the state, roughly 42°N to 45°N latitude? Hmm, probably unnecessary. Stick to landmarks: borders Lake Erie, Atlantic coast... \n\nWait—user might be asking literally for GPS coordinates? Unlikely, but just in case I can add: \"Geographically, New York City is at approximately 40.7°N latitude and 74.0°W longitude...\" but only if the answer feels too dry otherwise. \n\nPriority: Clear, simple, correct. No assuming prior knowledge. Say \"New York can refer to...\" then split into two cases. End with \"Most people mean the city when they say 'New York' in casual talk.\" That’s the key takeaway. \n\n...Double-check: Is Albany really the capital? Yes. And NYC is the largest city but not capital. Got it. Also, New York State is the 27th largest by area? Maybe not relevant. Stick to basics. \n\nFinal structure: \n1. \"New York can mean two things: the city or the state.\" \n2. For city: \n - Where it is (Northeast US) \n - Inside New York State \n - Famous parts (Manhattan etc.) \n3. For state: \n - Larger area, includes city \n - Borders (PA, NJ, CT, MA, VT, Canada) \n - Capital is Albany (not NYC!) \n4. Clear summary: \"In everyday talk, 'New York' usually means NYC.\" \n\nYeah. That covers it without overwhelming them. No markdown, just plain English. Keep it warm—add a \"hope that helps!\" at the end.\n\n\nThat's a common point of confusion! 
**\"New York\" can refer to two different things**, and it's important to distinguish between them:\n\n### 1. **New York City (NYC)** \n - **What it is**: The largest and most famous city in the United States, known for landmarks like Times Square, Central Park, the Statue of Liberty, and Wall Street. \n - **Where it is**: \n - Located in the **northeastern United States**. \n - Situated at the mouth of the **Hudson River**, where it meets the **Atlantic Ocean**. \n - Part of **New York State** (see below). \n - **Geographic details**: \n - Coordinates: Approximately **40.7° N latitude, 74.0° W longitude**. \n - Composed of **5 boroughs**: Manhattan (the \"city\" most people picture), Brooklyn, Queens, The Bronx, and Staten Island. \n - Panoramic view of NYC (including Brooklyn and New Jersey skyline):","reasoning_content":null,"reasoning":null,"tool_calls":[]},"logprobs":null,"finish_reason":"length","stop_reason":null,"mm_embedding_handle":null,"disaggregated_params":null,"avg_decoded_tokens_per_iter":1.0}],"usage":{"prompt_tokens":15,"total_tokens":1039,"completion_tokens":1024},"prompt_token_ids":null}
+```
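+
+If you prefer Python over `curl`, a minimal sketch using the OpenAI Python client works against the same endpoint (this assumes the `openai` package is installed on the client and that the server runs without authentication, as in this guide):
+
+```python
+from openai import OpenAI
+
+# Point the client at the local trtllm-serve endpoint; a placeholder API key is sufficient.
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
+
+response = client.chat.completions.create(
+    model="Qwen/Qwen3-Next-80B-A3B-Thinking",
+    messages=[{"role": "user", "content": "Where is New York?"}],
+    max_tokens=128,
+)
+print(response.choices[0].message.content)
+```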
+
+### Troubleshooting Tips
+
+* If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`.
+* Ensure your model checkpoints are compatible with the expected format.
+* For performance issues, check GPU utilization with nvidia-smi while the server is running.
+* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed.
+* For connection issues, make sure the server port (`8000` in this guide) is not being used by another application.
+
+
+
+## Benchmarking Performance
+
+To benchmark the performance of your TensorRT LLM server, you can leverage the built-in `benchmark_serving.py` script. To do this, first create a wrapper `bench.sh` script.
+
+```shell
+cat <<'EOF' > bench.sh
+#!/usr/bin/env bash
+set -euo pipefail
+
+concurrency_list="1 2 4 8 16 32 64 128 256"
+multi_round=5
+isl=1024
+osl=1024
+result_dir=/tmp/qwen3_output
+
+for concurrency in ${concurrency_list}; do
+ num_prompts=$((concurrency * multi_round))
+ python -m tensorrt_llm.serve.scripts.benchmark_serving \
+ --model Qwen/Qwen3-Next-80B-A3B-Thinking \
+ --backend openai \
+ --dataset-name "random" \
+ --random-input-len ${isl} \
+ --random-output-len ${osl} \
+ --random-prefix-len 0 \
+ --random-ids \
+ --num-prompts ${num_prompts} \
+ --max-concurrency ${concurrency} \
+ --ignore-eos \
+ --tokenize-on-client \
+ --percentile-metrics "ttft,tpot,itl,e2el"
+done
+EOF
+chmod +x bench.sh
+```
+
+To achieve maximum throughput with attention DP enabled, sweep concurrency up to `concurrency = max_batch_size * num_gpus` (for example, with `--max_batch_size 16` on 4 GPUs, sweep up to a concurrency of 64).
+
+If you want to save the results to a file, add the following options:
+
+```shell
+--save-result \
+--result-dir "${result_dir}" \
+--result-filename "concurrency_${concurrency}.json"
+```
+
+For more benchmarking options, see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py).
+
+Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script.
+
+```shell
+./bench.sh
+```
diff --git a/latest/_sources/examples/curl_chat_client.rst.txt b/latest/_sources/examples/curl_chat_client.rst.txt
index 18bcb39fd4..a7ac7d33bd 100644
--- a/latest/_sources/examples/curl_chat_client.rst.txt
+++ b/latest/_sources/examples/curl_chat_client.rst.txt
@@ -2,7 +2,7 @@ Curl Chat Client
================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/curl_chat_client.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/curl_chat_client.sh.
.. literalinclude:: ../../../examples/serve/curl_chat_client.sh
:lines: 1-11
diff --git a/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt b/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt
index d1eaddadf4..6733ff8537 100644
--- a/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt
+++ b/latest/_sources/examples/curl_chat_client_for_multimodal.rst.txt
@@ -2,7 +2,7 @@ Curl Chat Client For Multimodal
===============================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/curl_chat_client_for_multimodal.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/curl_chat_client_for_multimodal.sh.
.. literalinclude:: ../../../examples/serve/curl_chat_client_for_multimodal.sh
:lines: 1-88
diff --git a/latest/_sources/examples/curl_completion_client.rst.txt b/latest/_sources/examples/curl_completion_client.rst.txt
index db57b5fc9a..f238f00a56 100644
--- a/latest/_sources/examples/curl_completion_client.rst.txt
+++ b/latest/_sources/examples/curl_completion_client.rst.txt
@@ -2,7 +2,7 @@ Curl Completion Client
======================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/curl_completion_client.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/curl_completion_client.sh.
.. literalinclude:: ../../../examples/serve/curl_completion_client.sh
:lines: 1-10
diff --git a/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt b/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt
index 3dafbd5b2d..107eda1bc5 100644
--- a/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt
+++ b/latest/_sources/examples/deepseek_r1_reasoning_parser.rst.txt
@@ -2,7 +2,7 @@ Deepseek R1 Reasoning Parser
============================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/deepseek_r1_reasoning_parser.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/deepseek_r1_reasoning_parser.sh.
.. literalinclude:: ../../../examples/serve/deepseek_r1_reasoning_parser.sh
:lines: 1-10
diff --git a/latest/_sources/examples/genai_perf_client.rst.txt b/latest/_sources/examples/genai_perf_client.rst.txt
index 000dd2db6b..acbb077810 100644
--- a/latest/_sources/examples/genai_perf_client.rst.txt
+++ b/latest/_sources/examples/genai_perf_client.rst.txt
@@ -2,7 +2,7 @@ Genai Perf Client
=================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/genai_perf_client.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/genai_perf_client.sh.
.. literalinclude:: ../../../examples/serve/genai_perf_client.sh
:lines: 1-16
diff --git a/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt b/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt
index d9c10bf811..004b4e1604 100644
--- a/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt
+++ b/latest/_sources/examples/genai_perf_client_for_multimodal.rst.txt
@@ -2,7 +2,7 @@ Genai Perf Client For Multimodal
================================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/genai_perf_client_for_multimodal.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/genai_perf_client_for_multimodal.sh.
.. literalinclude:: ../../../examples/serve/genai_perf_client_for_multimodal.sh
:lines: 1-19
diff --git a/latest/_sources/examples/llm_api_examples.rst.txt b/latest/_sources/examples/llm_api_examples.rst.txt
index 23121ec2c3..7d78555f92 100644
--- a/latest/_sources/examples/llm_api_examples.rst.txt
+++ b/latest/_sources/examples/llm_api_examples.rst.txt
@@ -21,6 +21,7 @@ _____________
llm_guided_decoding
llm_logits_processor
llm_multilora
+ llm_sparse_attention
llm_speculative_decoding
llm_kv_cache_connector
llm_kv_cache_offloading
diff --git a/latest/_sources/examples/llm_guided_decoding.rst.txt b/latest/_sources/examples/llm_guided_decoding.rst.txt
index baa2bd504a..c3e955be75 100644
--- a/latest/_sources/examples/llm_guided_decoding.rst.txt
+++ b/latest/_sources/examples/llm_guided_decoding.rst.txt
@@ -1,6 +1,6 @@
Generate text with guided decoding
==================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_guided_decoding.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_guided_decoding.py.
.. literalinclude:: ../../../examples/llm-api/llm_guided_decoding.py
:lines: 4-47
diff --git a/latest/_sources/examples/llm_inference.rst.txt b/latest/_sources/examples/llm_inference.rst.txt
index 21dded0685..a8033d5ebd 100644
--- a/latest/_sources/examples/llm_inference.rst.txt
+++ b/latest/_sources/examples/llm_inference.rst.txt
@@ -1,6 +1,6 @@
Generate text
=============
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_inference.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_inference.py.
.. literalinclude:: ../../../examples/llm-api/llm_inference.py
:lines: 4-35
diff --git a/latest/_sources/examples/llm_inference_async.rst.txt b/latest/_sources/examples/llm_inference_async.rst.txt
index acfdad4f3d..95c177a5bf 100644
--- a/latest/_sources/examples/llm_inference_async.rst.txt
+++ b/latest/_sources/examples/llm_inference_async.rst.txt
@@ -1,6 +1,6 @@
Generate text asynchronously
============================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_inference_async.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_inference_async.py.
.. literalinclude:: ../../../examples/llm-api/llm_inference_async.py
:lines: 4-43
diff --git a/latest/_sources/examples/llm_inference_async_streaming.rst.txt b/latest/_sources/examples/llm_inference_async_streaming.rst.txt
index 96468d0082..63d352afcb 100644
--- a/latest/_sources/examples/llm_inference_async_streaming.rst.txt
+++ b/latest/_sources/examples/llm_inference_async_streaming.rst.txt
@@ -1,6 +1,6 @@
Generate text in streaming
==========================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_inference_async_streaming.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_inference_async_streaming.py.
.. literalinclude:: ../../../examples/llm-api/llm_inference_async_streaming.py
:lines: 4-64
diff --git a/latest/_sources/examples/llm_inference_distributed.rst.txt b/latest/_sources/examples/llm_inference_distributed.rst.txt
index c9f12eaef8..6ebbd8c0fb 100644
--- a/latest/_sources/examples/llm_inference_distributed.rst.txt
+++ b/latest/_sources/examples/llm_inference_distributed.rst.txt
@@ -1,6 +1,6 @@
Distributed LLM Generation
==========================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_inference_distributed.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_inference_distributed.py.
.. literalinclude:: ../../../examples/llm-api/llm_inference_distributed.py
:lines: 4-44
diff --git a/latest/_sources/examples/llm_kv_cache_connector.rst.txt b/latest/_sources/examples/llm_kv_cache_connector.rst.txt
index a95755fb11..42be662217 100644
--- a/latest/_sources/examples/llm_kv_cache_connector.rst.txt
+++ b/latest/_sources/examples/llm_kv_cache_connector.rst.txt
@@ -1,6 +1,6 @@
KV Cache Connector
==================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_kv_cache_connector.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_kv_cache_connector.py.
.. literalinclude:: ../../../examples/llm-api/llm_kv_cache_connector.py
:lines: 4-247
diff --git a/latest/_sources/examples/llm_kv_cache_offloading.rst.txt b/latest/_sources/examples/llm_kv_cache_offloading.rst.txt
index 0a8c5a0ad5..2a082ca896 100644
--- a/latest/_sources/examples/llm_kv_cache_offloading.rst.txt
+++ b/latest/_sources/examples/llm_kv_cache_offloading.rst.txt
@@ -1,6 +1,6 @@
KV Cache Offloading
===================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_kv_cache_offloading.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_kv_cache_offloading.py.
.. literalinclude:: ../../../examples/llm-api/llm_kv_cache_offloading.py
:lines: 4-134
diff --git a/latest/_sources/examples/llm_logits_processor.rst.txt b/latest/_sources/examples/llm_logits_processor.rst.txt
index 2bf646af07..0e994fc1e1 100644
--- a/latest/_sources/examples/llm_logits_processor.rst.txt
+++ b/latest/_sources/examples/llm_logits_processor.rst.txt
@@ -1,6 +1,6 @@
Control generated text using logits processor
=============================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_logits_processor.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_logits_processor.py.
.. literalinclude:: ../../../examples/llm-api/llm_logits_processor.py
:lines: 4-128
diff --git a/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt b/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt
index de91a150be..ac6efb4519 100644
--- a/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt
+++ b/latest/_sources/examples/llm_mgmn_llm_distributed.rst.txt
@@ -1,6 +1,6 @@
Run LLM-API with pytorch backend on Slurm
=========================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_mgmn_llm_distributed.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_mgmn_llm_distributed.sh.
.. literalinclude:: ../../../examples/llm-api/llm_mgmn_llm_distributed.sh
:lines: 1-10,14-55
diff --git a/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt b/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt
index 172cf342f3..98969e3d0f 100644
--- a/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt
+++ b/latest/_sources/examples/llm_mgmn_trtllm_bench.rst.txt
@@ -1,6 +1,6 @@
Run trtllm-bench with pytorch backend on Slurm
==============================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_mgmn_trtllm_bench.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_mgmn_trtllm_bench.sh.
.. literalinclude:: ../../../examples/llm-api/llm_mgmn_trtllm_bench.sh
:lines: 1-10,14-95
diff --git a/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt b/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt
index c8a30174b2..c8466b49ea 100644
--- a/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt
+++ b/latest/_sources/examples/llm_mgmn_trtllm_serve.rst.txt
@@ -1,6 +1,6 @@
Run trtllm-serve with pytorch backend on Slurm
==============================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_mgmn_trtllm_serve.sh.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_mgmn_trtllm_serve.sh.
.. literalinclude:: ../../../examples/llm-api/llm_mgmn_trtllm_serve.sh
:lines: 1-10,14-56
diff --git a/latest/_sources/examples/llm_multilora.rst.txt b/latest/_sources/examples/llm_multilora.rst.txt
index 558f0073b4..3062bfbfc8 100644
--- a/latest/_sources/examples/llm_multilora.rst.txt
+++ b/latest/_sources/examples/llm_multilora.rst.txt
@@ -1,6 +1,6 @@
Generate text with multiple LoRA adapters
=========================================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_multilora.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_multilora.py.
.. literalinclude:: ../../../examples/llm-api/llm_multilora.py
:lines: 4-89
diff --git a/latest/_sources/examples/llm_runtime.rst.txt b/latest/_sources/examples/llm_runtime.rst.txt
index acc8259803..d09fb465e6 100644
--- a/latest/_sources/examples/llm_runtime.rst.txt
+++ b/latest/_sources/examples/llm_runtime.rst.txt
@@ -1,6 +1,6 @@
Runtime Configuration Examples
==============================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_runtime.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_runtime.py.
.. literalinclude:: ../../../examples/llm-api/llm_runtime.py
:lines: 4-96
diff --git a/latest/_sources/examples/llm_sampling.rst.txt b/latest/_sources/examples/llm_sampling.rst.txt
index c380fcf20b..472f5ad5ca 100644
--- a/latest/_sources/examples/llm_sampling.rst.txt
+++ b/latest/_sources/examples/llm_sampling.rst.txt
@@ -1,6 +1,6 @@
Sampling Techniques Showcase
============================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_sampling.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_sampling.py.
.. literalinclude:: ../../../examples/llm-api/llm_sampling.py
:lines: 4-229
diff --git a/latest/_sources/examples/llm_sparse_attention.rst.txt b/latest/_sources/examples/llm_sparse_attention.rst.txt
new file mode 100644
index 0000000000..6c69a097cd
--- /dev/null
+++ b/latest/_sources/examples/llm_sparse_attention.rst.txt
@@ -0,0 +1,8 @@
+Sparse Attention
+================
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_sparse_attention.py.
+
+.. literalinclude:: ../../../examples/llm-api/llm_sparse_attention.py
+ :lines: 4-155
+ :language: python
+ :linenos:
diff --git a/latest/_sources/examples/llm_speculative_decoding.rst.txt b/latest/_sources/examples/llm_speculative_decoding.rst.txt
index f774aca5b4..f527ec5ce0 100644
--- a/latest/_sources/examples/llm_speculative_decoding.rst.txt
+++ b/latest/_sources/examples/llm_speculative_decoding.rst.txt
@@ -1,6 +1,6 @@
Speculative Decoding
====================
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/llm-api/llm_speculative_decoding.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/llm-api/llm_speculative_decoding.py.
.. literalinclude:: ../../../examples/llm-api/llm_speculative_decoding.py
:lines: 4-95
diff --git a/latest/_sources/examples/openai_chat_client.rst.txt b/latest/_sources/examples/openai_chat_client.rst.txt
index a897b9ddfd..95b81e296f 100644
--- a/latest/_sources/examples/openai_chat_client.rst.txt
+++ b/latest/_sources/examples/openai_chat_client.rst.txt
@@ -2,7 +2,7 @@ OpenAI Chat Client
==================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/openai_chat_client.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/openai_chat_client.py.
.. literalinclude:: ../../../examples/serve/openai_chat_client.py
:lines: 2-21
diff --git a/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt b/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt
index 810d27d646..5c88c9503f 100644
--- a/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt
+++ b/latest/_sources/examples/openai_chat_client_for_multimodal.rst.txt
@@ -2,7 +2,7 @@ OpenAI Chat Client for Multimodal
=================================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/openai_chat_client_for_multimodal.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/openai_chat_client_for_multimodal.py.
.. literalinclude:: ../../../examples/serve/openai_chat_client_for_multimodal.py
:lines: 2-114
diff --git a/latest/_sources/examples/openai_completion_client.rst.txt b/latest/_sources/examples/openai_completion_client.rst.txt
index ffc0cea191..df4ab76576 100644
--- a/latest/_sources/examples/openai_completion_client.rst.txt
+++ b/latest/_sources/examples/openai_completion_client.rst.txt
@@ -2,7 +2,7 @@ OpenAI Completion Client
========================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/openai_completion_client.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/openai_completion_client.py.
.. literalinclude:: ../../../examples/serve/openai_completion_client.py
:lines: 2-15
diff --git a/latest/_sources/examples/openai_completion_client_for_lora.rst.txt b/latest/_sources/examples/openai_completion_client_for_lora.rst.txt
index b4449ae4c1..da70fefb76 100644
--- a/latest/_sources/examples/openai_completion_client_for_lora.rst.txt
+++ b/latest/_sources/examples/openai_completion_client_for_lora.rst.txt
@@ -2,7 +2,7 @@ Openai Completion Client For Lora
=================================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/openai_completion_client_for_lora.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/openai_completion_client_for_lora.py.
.. literalinclude:: ../../../examples/serve/openai_completion_client_for_lora.py
:lines: 1-30
diff --git a/latest/_sources/examples/openai_completion_client_json_schema.rst.txt b/latest/_sources/examples/openai_completion_client_json_schema.rst.txt
index b802f5921e..c684fc016d 100644
--- a/latest/_sources/examples/openai_completion_client_json_schema.rst.txt
+++ b/latest/_sources/examples/openai_completion_client_json_schema.rst.txt
@@ -2,7 +2,7 @@ OpenAI Completion Client with JSON Schema
=========================================
Refer to the `trtllm-serve documentation `_ for starting a server.
-Source https://github.com/NVIDIA/TensorRT-LLM/blob/560ded5450b79efde720162fc397d7efa59aae6d/examples/serve/openai_completion_client_json_schema.py.
+Source https://github.com/NVIDIA/TensorRT-LLM/blob/796891ba2a6959bad58c0da9645416c7264349e9/examples/serve/openai_completion_client_json_schema.py.
.. literalinclude:: ../../../examples/serve/openai_completion_client_json_schema.py
:lines: 2-52
diff --git a/latest/_sources/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md.txt b/latest/_sources/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md.txt
index 1bbf2d1bd8..d5e0cde8f2 100644
--- a/latest/_sources/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md.txt
+++ b/latest/_sources/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md.txt
@@ -40,29 +40,31 @@ trtllm-bench \
#### Basic Performance Configuration (`autodeploy_config.yaml`)
```yaml
-# Compilation backend
-compile_backend: torch-opt
-
-# Runtime engine
+# runtime engine
runtime: trtllm
-# Model loading
+# model loading
skip_loading_weights: false
-# Fraction of free memory to use for kv-caches
-free_mem_ratio: 0.8
-
-# CUDA Graph optimization
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
-
-# Attention backend
-attn_backend: flashinfer
-
# Sequence configuration
max_batch_size: 256
+
+# transform options
+transforms:
+ insert_cached_attention:
+ # attention backend
+ backend: flashinfer
+ resize_kv_cache:
+ # fraction of free memory to use for kv-caches
+ free_mem_ratio: 0.8
+ compile_model:
+ # compilation backend
+ backend: torch-opt
+ # CUDA Graph optimization
+ cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
```
-Enable multi-GPU execution by specifying `--tp n`, where `n` is the number of GPUs
+Enable multi-GPU execution by specifying `--tp n`, where `n` is the number of GPUs.
## Configuration Options Reference
diff --git a/latest/_sources/features/auto_deploy/advanced/expert_configurations.md.txt b/latest/_sources/features/auto_deploy/advanced/expert_configurations.md.txt
index 60cfd197a9..471bd21391 100644
--- a/latest/_sources/features/auto_deploy/advanced/expert_configurations.md.txt
+++ b/latest/_sources/features/auto_deploy/advanced/expert_configurations.md.txt
@@ -63,15 +63,15 @@ args:
num_hidden_layers: 12
hidden_size: 1024
world_size: 4
- compile_backend: torch-compile
- attn_backend: triton
max_seq_len: 2048
max_batch_size: 16
transforms:
- sharding:
- strategy: auto
- quantization:
- enabled: false
+ detect_sharding:
+ support_partial_config: true
+ insert_cached_attention:
+ backend: triton
+ compile_model:
+ backend: torch-compile
prompt:
batch_size: 8
@@ -79,13 +79,6 @@ prompt:
max_tokens: 150
temperature: 0.8
top_k: 50
-
-benchmark:
- enabled: true
- num: 20
- bs: 4
- isl: 1024
- osl: 256
```
Create an additional override file (e.g., `production.yaml`):
@@ -94,11 +87,10 @@ Create an additional override file (e.g., `production.yaml`):
# production.yaml
args:
world_size: 8
- compile_backend: torch-opt
max_batch_size: 32
-
-benchmark:
- enabled: false
+ transforms:
+ compile_model:
+ backend: torch-opt
```
Then use these configurations:
@@ -107,18 +99,18 @@ Then use these configurations:
# Using single YAML config
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
- --yaml-configs my_config.yaml
+ --yaml-extra my_config.yaml
# Using multiple YAML configs (deep merged in order, later files have higher priority)
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
- --yaml-configs my_config.yaml production.yaml
+ --yaml-extra my_config.yaml production.yaml
# Targeting nested AutoDeployConfig with separate YAML
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
- --yaml-configs my_config.yaml \
- --args.yaml-configs autodeploy_overrides.yaml
+ --yaml-extra my_config.yaml \
+ --args.yaml-extra autodeploy_overrides.yaml
```
## Configuration Precedence and Deep Merging
@@ -126,7 +118,7 @@ python build_and_run_ad.py \
The configuration system follows a precedence order in which higher priority sources override lower priority ones:
1. **CLI Arguments** (highest priority) - Direct command line arguments
-1. **YAML Configs** - Files specified via `--yaml-configs` and `--args.yaml-configs`
+1. **YAML Configs** - Files specified via `--yaml-extra` and `--args.yaml-extra`
1. **Default Settings** (lowest priority) - Built-in defaults from the config classes
**Deep Merging**: Unlike simple overwriting, deep merging recursively combines nested dictionaries. For example:
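A minimal Python sketch of this deep-merge behavior (a hypothetical helper, not the project's actual merge code):

```python
def deep_merge(base: dict, override: dict) -> dict:
    """Recursively merge override into base; override wins on conflicts."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

base = {"args": {"world_size": 4, "transforms": {"compile_model": {"backend": "torch-compile"}}}}
override = {"args": {"world_size": 8}}
# world_size is overridden while the nested transforms subtree is preserved
print(deep_merge(base, override))
```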
@@ -152,12 +144,12 @@ args:
**Nested Config Behavior**: When using nested configurations, outer YAML configuration files become initialization settings for inner objects, giving them higher precedence:
```bash
-# The outer yaml-configs affects the entire ExperimentConfig
-# The inner args.yaml-configs affects only the AutoDeployConfig
+# The outer yaml-extra affects the entire ExperimentConfig
+# The inner args.yaml-extra affects only the AutoDeployConfig
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
- --yaml-configs experiment_config.yaml \
- --args.yaml-configs autodeploy_config.yaml \
+ --yaml-extra experiment_config.yaml \
+ --args.yaml-extra autodeploy_config.yaml \
--args.world-size=8 # CLI override beats both YAML configs
```
diff --git a/latest/_sources/features/auto_deploy/advanced/workflow.md.txt b/latest/_sources/features/auto_deploy/advanced/workflow.md.txt
index 191fa6f276..f1bd715029 100644
--- a/latest/_sources/features/auto_deploy/advanced/workflow.md.txt
+++ b/latest/_sources/features/auto_deploy/advanced/workflow.md.txt
@@ -18,9 +18,7 @@ llm = LLM(
attn_page_size=64, # page size for attention (tokens_per_block, should be == max_seq_len for triton)
skip_loading_weights=False,
model_factory="AutoModelForCausalLM", # choose appropriate model factory
- mla_backend="MultiHeadLatentAttention", # for models that support MLA
free_mem_ratio=0.8, # fraction of available memory for cache
- simple_shard_only=False, # tensor parallelism sharding strategy
max_seq_len=,
max_batch_size=,
)
diff --git a/latest/_sources/features/auto_deploy/support_matrix.md.txt b/latest/_sources/features/auto_deploy/support_matrix.md.txt
index c8780cbca1..a41090932f 100644
--- a/latest/_sources/features/auto_deploy/support_matrix.md.txt
+++ b/latest/_sources/features/auto_deploy/support_matrix.md.txt
@@ -113,6 +113,7 @@ Optimize attention operations with different attention kernel implementations:
| `"attn_backend"` | Description |
|----------------------|-------------|
+| `torch` | Custom fused multi-head attention (MHA) with KV Cache reference implementation in pure PyTorch (slow!) |
| `triton` | Custom fused multi-head attention (MHA) with KV Cache kernels for efficient attention processing. |
| `flashinfer` | Uses optimized attention kernels with KV Cache from the [`flashinfer`](https://github.com/flashinfer-ai/flashinfer.git) library. |
diff --git a/latest/_sources/features/disagg-serving.md.txt b/latest/_sources/features/disagg-serving.md.txt
index 8af2c188a5..a6335b9c92 100644
--- a/latest/_sources/features/disagg-serving.md.txt
+++ b/latest/_sources/features/disagg-serving.md.txt
@@ -106,7 +106,7 @@ cache_transceiver_config:
max_tokens_in_buffer:
```
-`backend` specifies the communication backend for transferring the kvCache, valid options include `DEFAULT`,`UCX`, `NIXL`, and `MPI`, the default backend is UCX.
+`backend` specifies the communication backend for transferring the kvCache. Valid options include `DEFAULT`, `UCX`, `NIXL`, and `MPI`; the default backend is NIXL.
`max_tokens_in_buffer` defines the buffer size for kvCache transfers, it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests for optimal performance.
diff --git a/latest/_sources/features/feature-combination-matrix.md.txt b/latest/_sources/features/feature-combination-matrix.md.txt
index 22410574ec..0a392c4f5d 100644
--- a/latest/_sources/features/feature-combination-matrix.md.txt
+++ b/latest/_sources/features/feature-combination-matrix.md.txt
@@ -9,11 +9,11 @@
| Chunked Prefill | Yes | Yes | Yes | Yes | --- | | | | | | | | | | |
| MTP | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | | | |
| EAGLE-3(One Model Engine) | Yes | Yes | Yes | Yes | Yes | No | --- | | | | | | | | |
-| EAGLE-3(Two Model Engine) | No | Yes | Yes | Yes | Yes | No | No | --- | | | | | | | |
+| EAGLE-3(Two Model Engine) | Yes | Yes | Yes | Yes | Yes | No | No | --- | | | | | | | |
| Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | | |
| TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | | |
| KV Cache Reuse | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | |
| Slide Window Attention | Yes | Yes | Yes | Yes | Yes | No | Untested | Untested | Yes | Yes | WIP | --- | | | |
| Logits Post Processor | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | | |
-| Guided Decoding | Yes | Yes | Yes | Yes | Yes | No | No | Yes | Yes | Yes | Yes | Yes | Yes | --- | |
+| Guided Decoding | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | |
| LoRA | Yes | No | Untested | Untested | Untested | Untested | Untested | Untested | Yes | Yes | Yes | Yes | Yes | Untested | --- |
diff --git a/latest/_sources/features/ray-orchestrator.md.txt b/latest/_sources/features/ray-orchestrator.md.txt
new file mode 100644
index 0000000000..4984c180b7
--- /dev/null
+++ b/latest/_sources/features/ray-orchestrator.md.txt
@@ -0,0 +1,42 @@
+# Ray Orchestrator (Prototype)
+
+```{note}
+This project is under active development and currently in a prototype stage. The current focus is on core functionality, with performance optimization coming soon. While we strive for correctness, there are currently no guarantees regarding functionality, stability, or reliability.
+```
+
+## Motivation
+The **Ray orchestrator** uses [Ray](https://docs.ray.io/en/latest/index.html) instead of MPI to manage workers for single- and multi-node inference. It’s a first step toward making TensorRT-LLM a better fit for Reinforcement Learning from Human Feedback (RLHF) workflows. For RLHF, Ray can dynamically spawn and reconnect distributed inference actors, each with its own parallelism strategy. This feature is a prototype and under active development. MPI remains the default in TensorRT-LLM.
+
+
+## Basic Usage
+To use the Ray orchestrator, first install Ray.
+```shell
+cd examples/ray_orchestrator
+pip install -r requirements.txt
+```
+
+To run a simple `TP=2` example with a Hugging Face model:
+
+```shell
+python llm_inference_distributed_ray.py
+```
+
+This example is the same as in `/examples/llm-api`, with the only change being `orchestrator_type="ray"` on `LLM()`. Other examples can be adapted similarly by toggling this flag.
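+
+A minimal sketch of that change (the model name and parallelism below are illustrative; `orchestrator_type="ray"` is the only Ray-specific argument):
+
+```python
+from tensorrt_llm import LLM
+
+# Same as the regular LLM API examples, except orchestrator_type="ray",
+# which makes TensorRT-LLM manage its workers with Ray instead of MPI.
+llm = LLM(
+    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # illustrative model
+    tensor_parallel_size=2,
+    orchestrator_type="ray",
+)
+
+for output in llm.generate(["Where is New York?"]):
+    print(output.outputs[0].text)
+```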
+
+
+## Features
+Currently available:
+- Generate text asynchronously (refer to [llm_inference_async_ray.py](/examples/ray_orchestrator/llm_inference_async_ray.py))
+- Multi-node inference (refer to [multi-node README](/examples/ray_orchestrator/multi_nodes/README.md))
+- Disaggregated serving (refer to [disagg README](/examples/ray_orchestrator/disaggregated/README.md))
+
+*Initial testing has been focused on LLaMA and DeepSeek variants. Please open an Issue if you encounter problems with other models so we can prioritize support.*
+
+## Roadmap
+- Performance optimization
+- Integration with RLHF frameworks, such as [Verl](https://github.com/volcengine/verl) and [NVIDIA NeMo-RL](https://github.com/NVIDIA-NeMo/RL).
+
+## Architecture
+This feature introduces new classes such as [RayExecutor](/tensorrt_llm/executor/ray_executor.py) and [RayGPUWorker](/tensorrt_llm/executor/ray_gpu_worker.py) for Ray actor lifecycle management and distributed inference. In Ray mode, collective ops run on [torch.distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) without MPI. We welcome contributions to improve and extend this support.
+
+
diff --git a/latest/_sources/index.rst.txt b/latest/_sources/index.rst.txt
index 0389ebd489..c1fcd8ea78 100644
--- a/latest/_sources/index.rst.txt
+++ b/latest/_sources/index.rst.txt
@@ -73,6 +73,7 @@ Welcome to TensorRT LLM's Documentation!
features/speculative-decoding.md
features/checkpoint-loading.md
features/auto_deploy/auto-deploy.md
+ features/ray-orchestrator.md
.. toctree::
:maxdepth: 2
diff --git a/latest/_sources/installation/build-from-source-linux.md.txt b/latest/_sources/installation/build-from-source-linux.md.txt
index 19dab71c76..7b94aa8811 100644
--- a/latest/_sources/installation/build-from-source-linux.md.txt
+++ b/latest/_sources/installation/build-from-source-linux.md.txt
@@ -147,6 +147,11 @@ check .
## Build TensorRT LLM
+```{tip}
+:name: build-from-source-tip-cuda-version
+TensorRT LLM 1.1 supports both CUDA 12.9 and 13.0, but some dependency changes are required. The `requirements.txt` file contains the dependencies needed for CUDA 13.0. If you are using CUDA 12.9, uncomment the lines ending with `# ` and comment out the lines that follow them.
+```
+
### Option 1: Full Build with C++ Compilation
The following command compiles the C++ code and packages the compiled libraries along with the Python files into a wheel. When developing C++ code, you need this full build command to apply your code changes.
diff --git a/latest/_sources/installation/linux.md.txt b/latest/_sources/installation/linux.md.txt
index 02a0cf7817..2cf211038d 100644
--- a/latest/_sources/installation/linux.md.txt
+++ b/latest/_sources/installation/linux.md.txt
@@ -12,11 +12,19 @@
Install CUDA Toolkit following the [CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and
make sure `CUDA_HOME` environment variable is properly set.
+ ```{tip}
+ :name: installation-linux-tip-cuda-version
+ TensorRT LLM 1.1 supports both CUDA 12.9 and 13.0. The wheel package release only supports CUDA 12.9, while CUDA 13.0 is only supported through NGC container release.
+ ```
+
```bash
# Optional step: Only required for NVIDIA Blackwell GPUs and SBSA platform
pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
sudo apt-get -y install libopenmpi-dev
+
+ # Optional step: Only required for disagg-serving
+ sudo apt-get -y install libzmq3-dev
```
PyTorch CUDA 12.8 package is required for supporting NVIDIA Blackwell GPUs and SBSA platform. On prior GPUs or Linux x86_64 platform, this extra installation is not required.
diff --git a/latest/_sources/llm-api/reference.rst.txt b/latest/_sources/llm-api/reference.rst.txt
index cb440b45b5..13bc53d79f 100644
--- a/latest/_sources/llm-api/reference.rst.txt
+++ b/latest/_sources/llm-api/reference.rst.txt
@@ -288,7 +288,7 @@ API Reference
:special-members: __init__
:member-order: groupwise
:inherited-members:
- :exclude-members: json,validate,model_dump_json,parse_obj,model_extra,model_post_init,model_dump,copy,model_fields_set,model_copy,model_computed_fields,model_parametrized_name,model_validate_json,dict,model_construct,model_fields,model_config,parse_file,construct,schema,model_validate_strings,model_validate,schema_json,model_rebuild,from_orm,update_forward_refs,parse_raw,model_json_schema
+ :exclude-members: model_config,model_rebuild,parse_raw,model_copy,parse_file,json,validate,model_validate,model_json_schema,copy,model_construct,model_fields,schema,update_forward_refs,model_computed_fields,model_validate_json,parse_obj,model_dump_json,model_validate_strings,model_extra,model_dump,from_orm,dict,model_fields_set,model_parametrized_name,model_post_init,schema_json,construct
.. autoclass:: tensorrt_llm.llmapi.TrtLlmArgs
:members:
@@ -297,7 +297,7 @@ API Reference
:special-members: __init__
:member-order: groupwise
:inherited-members:
- :exclude-members: json,validate,model_dump_json,parse_obj,model_extra,model_post_init,model_dump,copy,model_fields_set,model_copy,model_computed_fields,model_parametrized_name,model_validate_json,dict,model_construct,model_fields,model_config,parse_file,construct,schema,model_validate_strings,model_validate,schema_json,model_rebuild,from_orm,update_forward_refs,parse_raw,model_json_schema
+ :exclude-members: model_config,model_rebuild,parse_raw,model_copy,parse_file,json,validate,model_validate,model_json_schema,copy,model_construct,model_fields,schema,update_forward_refs,model_computed_fields,model_validate_json,parse_obj,model_dump_json,model_validate_strings,model_extra,model_dump,from_orm,dict,model_fields_set,model_parametrized_name,model_post_init,schema_json,construct
.. autoclass:: tensorrt_llm.llmapi.AutoDecodingConfig
:members:
@@ -323,3 +323,19 @@ API Reference
:member-order: groupwise
:inherited-members:
+.. autoclass:: tensorrt_llm.llmapi.SaveHiddenStatesDecodingConfig
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :special-members: __init__
+ :member-order: groupwise
+ :inherited-members:
+
+.. autoclass:: tensorrt_llm.llmapi.RocketSparseAttentionConfig
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :special-members: __init__
+ :member-order: groupwise
+ :inherited-members:
+
diff --git a/latest/_sources/models/supported-models.md.txt b/latest/_sources/models/supported-models.md.txt
index 85846974d1..eafbc11b3f 100644
--- a/latest/_sources/models/supported-models.md.txt
+++ b/latest/_sources/models/supported-models.md.txt
@@ -23,7 +23,7 @@ The following is a table of supported models for the PyTorch backend:
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B` |
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B` |
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B` |
-
+| `Qwen3NextForCausalLM` | Qwen3Next | `Qwen/Qwen3-Next-80B-A3B-Thinking` |
## Model-Feature Support Matrix(Key Models)
@@ -34,6 +34,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
| ------------------------------ | ----------------- | ---------- | -------------------------- | --------------------- | --------------- | --- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ------------------------ | --------------------- | --------------- |
| DeepseekV3ForCausalLM | Yes | Yes | Yes | Yes | Yes [^1] | Yes | No | No | Yes | Yes | Yes [^2] | N/A | Yes | Yes |
| Qwen3MoeForCausalLM | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | N/A | Yes | Yes |
+| Qwen3NextForCausalLM | Yes | Yes | No | Untested | Yes | No | No | No | Yes | Yes | No | No | Untested | Untested |
| Llama4ForConditionalGeneration | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | Yes | Yes | Untested | N/A | Yes | Yes |
| GPT-OSS | Yes | Yes | Yes | Yes | No | No | Yes | No | Yes | Yes | No | N/A | Yes | Yes |
diff --git a/latest/_sources/quick-start-guide.md.txt b/latest/_sources/quick-start-guide.md.txt
index a0bff8d055..7e9432c86f 100644
--- a/latest/_sources/quick-start-guide.md.txt
+++ b/latest/_sources/quick-start-guide.md.txt
@@ -8,7 +8,7 @@ This is the starting point to try out TensorRT LLM. Specifically, this Quick Sta
## Launch Docker on a node with NVIDIA GPUs deployed
```bash
-docker run --rm -it --ipc host --gpus all --ulimit memlock=-1 --ulimit stack=67108864 -p 8000:8000 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc0
+docker run --rm -it --ipc host --gpus all --ulimit memlock=-1 --ulimit stack=67108864 -p 8000:8000 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc1
```
diff --git a/latest/_sources/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md.txt b/latest/_sources/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md.txt
index 6032aacd4f..43e2a1a46e 100644
--- a/latest/_sources/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md.txt
+++ b/latest/_sources/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md.txt
@@ -40,29 +40,31 @@ trtllm-bench \
#### Basic Performance Configuration (`autodeploy_config.yaml`)
```yaml
-# Compilation backend
-compile_backend: torch-opt
-
-# Runtime engine
+# runtime engine
runtime: trtllm
-# Model loading
+# model loading
skip_loading_weights: false
-# Fraction of free memory to use for kv-caches
-free_mem_ratio: 0.8
-
-# CUDA Graph optimization
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
-
-# Attention backend
-attn_backend: flashinfer
-
# Sequence configuration
max_batch_size: 256
+
+# transform options
+transforms:
+ insert_cached_attention:
+ # attention backend
+ backend: flashinfer
+ resize_kv_cache:
+ # fraction of free memory to use for kv-caches
+ free_mem_ratio: 0.8
+ compile_model:
+ # compilation backend
+ backend: torch-opt
+ # CUDA Graph optimization
+ cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
```
-Enable multi-GPU execution by specifying `--tp n`, where `n` is the number of GPUs
+Enable multi-GPU execution by specifying `--tp n`, where `n` is the number of GPUs.
## Configuration Options Reference
diff --git a/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt b/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt
index 76ba2fe2b4..afc55d24f8 100644
--- a/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt
+++ b/latest/_sources/torch/auto_deploy/advanced/expert_configurations.md.txt
@@ -63,15 +63,15 @@ args:
num_hidden_layers: 12
hidden_size: 1024
world_size: 4
- compile_backend: torch-compile
- attn_backend: triton
max_seq_len: 2048
max_batch_size: 16
transforms:
- sharding:
- strategy: auto
- quantization:
- enabled: false
+ detect_sharding:
+ support_partial_config: true
+ insert_cached_attention:
+ backend: triton
+ compile_model:
+ backend: torch-compile
prompt:
batch_size: 8
@@ -79,13 +79,6 @@ prompt:
max_tokens: 150
temperature: 0.8
top_k: 50
-
-benchmark:
- enabled: true
- num: 20
- bs: 4
- isl: 1024
- osl: 256
```
Create an additional override file (e.g., `production.yaml`):
@@ -94,11 +87,10 @@ Create an additional override file (e.g., `production.yaml`):
# production.yaml
args:
world_size: 8
- compile_backend: torch-opt
max_batch_size: 32
-
-benchmark:
- enabled: false
+ transforms:
+ compile_model:
+ backend: torch-opt
```
Then use these configurations:
@@ -107,18 +99,18 @@ Then use these configurations:
# Using single YAML config
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
- --yaml-configs my_config.yaml
+ --yaml-extra my_config.yaml
# Using multiple YAML configs (deep merged in order, later files have higher priority)
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
- --yaml-configs my_config.yaml production.yaml
+ --yaml-extra my_config.yaml production.yaml
# Targeting nested AutoDeployConfig with separate YAML
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
- --yaml-configs my_config.yaml \
- --args.yaml-configs autodeploy_overrides.yaml
+ --yaml-extra my_config.yaml \
+ --args.yaml-extra autodeploy_overrides.yaml
```
## Configuration Precedence and Deep Merging
@@ -126,7 +118,7 @@ python build_and_run_ad.py \
The configuration system follows a precedence order in which higher priority sources override lower priority ones:
1. **CLI Arguments** (highest priority) - Direct command line arguments
-1. **YAML Configs** - Files specified via `--yaml-configs` and `--args.yaml-configs`
+1. **YAML Configs** - Files specified via `--yaml-extra` and `--args.yaml-extra`
1. **Default Settings** (lowest priority) - Built-in defaults from the config classes
**Deep Merging**: Unlike simple overwriting, deep merging recursively combines nested dictionaries. For example:
@@ -152,12 +144,12 @@ args:
**Nested Config Behavior**: When using nested configurations, outer YAML configuration files become initialization settings for inner objects, giving them higher precedence:
```bash
-# The outer yaml-configs affects the entire ExperimentConfig
-# The inner args.yaml-configs affects only the AutoDeployConfig
+# The outer yaml-extra affects the entire ExperimentConfig
+# The inner args.yaml-extra affects only the AutoDeployConfig
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
- --yaml-configs experiment_config.yaml \
- --args.yaml-configs autodeploy_config.yaml \
+ --yaml-extra experiment_config.yaml \
+ --args.yaml-extra autodeploy_config.yaml \
--args.world-size=8 # CLI override beats both YAML configs
```
diff --git a/latest/_sources/torch/auto_deploy/advanced/serving_with_trtllm_serve.md.txt b/latest/_sources/torch/auto_deploy/advanced/serving_with_trtllm_serve.md.txt
index 5a73d047ea..6e52fe4ea4 100644
--- a/latest/_sources/torch/auto_deploy/advanced/serving_with_trtllm_serve.md.txt
+++ b/latest/_sources/torch/auto_deploy/advanced/serving_with_trtllm_serve.md.txt
@@ -42,23 +42,31 @@ trtllm-serve \
Example `autodeploy_config.yaml`:
```yaml
-# Compilation backend for AutoDeploy
-compile_backend: torch-opt # options: torch-simple, torch-compile, torch-cudagraph, torch-opt
+# runtime engine
+runtime: trtllm
-# Runtime engine
-runtime: trtllm # options: trtllm, demollm
+# model loading
+skip_loading_weights: false
-# Model loading
-skip_loading_weights: false # set true for architecture-only perf runs
+# Sequence configuration
+max_batch_size: 256
-# KV cache memory
-free_mem_ratio: 0.8 # fraction of free GPU mem for KV cache
+# multi-gpu execution
+world_size: 1
-# CUDA graph optimization
-cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64]
-
-# Attention backend
-attn_backend: flashinfer # recommended for best performance
+# transform options
+transforms:
+ insert_cached_attention:
+ # attention backend
+ backend: flashinfer
+ resize_kv_cache:
+ # fraction of free memory to use for kv-caches
+ free_mem_ratio: 0.8
+ compile_model:
+ # compilation backend
+ backend: torch-opt
+ # CUDA Graph optimization
+ cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
```
## Limitations and tips
diff --git a/latest/_sources/torch/auto_deploy/advanced/workflow.md.txt b/latest/_sources/torch/auto_deploy/advanced/workflow.md.txt
index 191fa6f276..5debad44d3 100644
--- a/latest/_sources/torch/auto_deploy/advanced/workflow.md.txt
+++ b/latest/_sources/torch/auto_deploy/advanced/workflow.md.txt
@@ -12,15 +12,18 @@ from tensorrt_llm._torch.auto_deploy import LLM
llm = LLM(
model=,
world_size=,
- compile_backend="torch-compile",
- model_kwargs={"num_hidden_layers": 2}, # test with smaller model configuration
- attn_backend="flashinfer", # choose between "triton" and "flashinfer"
- attn_page_size=64, # page size for attention (tokens_per_block, should be == max_seq_len for triton)
- skip_loading_weights=False,
model_factory="AutoModelForCausalLM", # choose appropriate model factory
- mla_backend="MultiHeadLatentAttention", # for models that support MLA
- free_mem_ratio=0.8, # fraction of available memory for cache
- simple_shard_only=False, # tensor parallelism sharding strategy
+ model_kwargs={"num_hidden_layers": 2}, # test with smaller model configuration
+ transforms={
+ "insert_cached_attention": {"backend": "flashinfer"}, # or "triton"
+ "insert_cached_mla_attention": {"backend": "MultiHeadLatentAttention"},
+ "resize_kv_cache": {"free_mem_ratio": 0.8},
+ "compile_model": {"backend": "torch-compile"},
+ "detect_sharding": {"simple_shard_only": False},
+
+ },
+ attn_page_size=64, # page size for attention
+ skip_loading_weights=False,
max_seq_len=,
max_batch_size=,
)
diff --git a/latest/_sources/torch/features/feature_combination_matrix.md.txt b/latest/_sources/torch/features/feature_combination_matrix.md.txt
deleted file mode 100644
index ab939af119..0000000000
--- a/latest/_sources/torch/features/feature_combination_matrix.md.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-# Feature Combination Matrix
-
-| Feature | Overlap Scheduler | CUDA Graph | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Slide Window Attention | Logits Post Processor | Guided Decoding |
-| -------------------------- | ----------------- | ---------- | -------------------------- | --------------------- | --------------- | -------- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ---------------------- | --------------------- | --------------- |
-| Overlap Scheduler | --- | | | | | | | | | | | | | |
-| CUDA Graph | Yes | --- | | | | | | | | | | | | |
-| Attention Data Parallelism | Yes | Yes | --- | | | | | | | | | | | |
-| Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | |
-| Chunked Prefill | Yes | Yes | Yes | Yes | --- | | | | | | | | | |
-| MTP | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | | |
-| EAGLE-3(One Model Engine) | Yes | Yes | Yes | Yes | Yes | No | --- | | | | | | | |
-| EAGLE-3(Two Model Engine) | Yes | Yes | Yes | Yes | Yes | No | No | --- | | | | | | |
-| Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | |
-| TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
-| KV Cache Reuse | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | |
-| Slide Window Attention | Yes | Yes | Yes | Yes | Yes | No | Untested | Untested | Yes | Yes | WIP | --- | | |
-| Logits Post Processor | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | |
-| Guided Decoding | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- |
diff --git a/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html b/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
index b1467019c6..9ff5e24835 100644
--- a/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
+++ b/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
@@ -59,7 +59,7 @@
@@ -71,7 +71,7 @@
-
+
@@ -337,6 +337,7 @@
In this blog post, we focus on performance optimization, diving deeper into techniques such as lower precision, network structure refactoring, and aggressive kernel fusion. We hope this analysis and optimization process brings new inspiration to your model inference optimization work.
Let’s first take a look at the network structure before the optimizations, to give an overall view of what the workload looks like:
+
Figure 1: Network structure overview before optimization
+
In this third blog of our scaling Expert Parallelism (EP) series, we push the performance boundaries of large-scale EP on NVIDIA GB200 NVL72 through multiple optimization techniques. Building upon the foundation established in part 1 and part 2, this blog explores three key optimization pillars: lower precision computation (including FP4 quantization for wo GEMM, low-precision AlltoAll communication, and FP8 context FMHA), network structure rethinking (featuring MTP LM head tensor parallelism and context phase Q/K/V concatenation elimination), and aggressive kernel fusion and overlap (leveraging Programmatic Dependent Launch, fused AlltoAll operations, and torch.compile optimizations). These optimizations collectively deliver significant end-to-end performance improvements for wide-EP scenarios on NVIDIA GB200 NVL72, for DeepSeek R1 with its specialized Multi-head Latent Attention (MLA) mechanism. Each technique is carefully designed to maintain accuracy while maximizing performance, demonstrating the power of combining algorithmic innovation with deep hardware awareness.
The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1’s MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. Here, “wo” is an abbreviation for the output weight matrix.
+
We’ve evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The NVIDIA TensorRT Model Optimizer team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace:
In TensorRT LLM, this is supported by PR 6393. To utilize the checkpoints, simply use the LLM API or trtllm-serve to load them. Refer to deploy-with-tensorrt-llm for more details.
In wide-EP MoE, the combine phase (after experts finish FC2) performs an all-to-all to return each token’s expert outputs to its origin rank, followed by a per-token reduce over top-k experts.
+
This step is typically bandwidth-bound when FC2 outputs are in BF16 or FP16. We introduce a low-precision AlltoAll that transmits these combine payloads in NVFP4 instead of BF16/FP16, then dequantizes back on the receiver before the local reduction.
+
During combine, we temporarily quantize the per-token expert outputs to NVFP4 (e2m1 values with per-16-element E4M3 scale factors plus a global scale) inside shared memory, send the compact representation across GPUs, and dequantize back to the original dtype on the receiving side. Indices and routing-related small tensors remain in their native types.
+
Since we quantize only for transport and outputs are dequantized back to the working dtype before the per-token reduction, we observe negligible accuracy impact; tolerances comparable to a quant-dequant roundtrip are sufficient. This feature is supported by PR 7155 and PR 7898.
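As a rough illustration of why the accuracy impact is small, here is a simplified Python emulation of a block-scaled FP4 (e2m1) quant-dequant roundtrip; the real kernels pack 4-bit codes with E4M3 block scales in shared memory, and the helper and shapes below are illustrative only:

```python
import torch

E2M1_GRID = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])  # representable FP4 magnitudes

def fake_nvfp4_roundtrip(x: torch.Tensor, block: int = 16) -> torch.Tensor:
    """Quantize per-16-element blocks to the e2m1 value grid, then dequantize back."""
    xb = x.reshape(-1, block)
    scale = (xb.abs().amax(dim=1, keepdim=True) / E2M1_GRID.max()).clamp(min=1e-12)
    mag = (xb / scale).abs()
    idx = (mag.unsqueeze(-1) - E2M1_GRID).abs().argmin(dim=-1)  # nearest grid value
    return (torch.sign(xb) * E2M1_GRID[idx] * scale).reshape_as(x)

x = torch.randn(64, 7168)  # per-token expert outputs (illustrative shape)
err = (fake_nvfp4_roundtrip(x) - x).abs().mean().item()
print(f"mean absolute roundtrip error: {err:.4f}")
```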
FP8 context FMHA is a technique that uses the FP8 data format to accelerate the FMHA/MLA computation during the context phase of a model. This combination is designed to improve TTFT and prefill throughput, particularly when processing long contexts, without significantly sacrificing accuracy.
+
In the context phase, the K and V can be stored in FP8 format, which is often referred to as FP8 KV Cache. Using FP8 KV cache can significantly save GPU memory, which is especially beneficial for long input sequences.
+However, since Q is in BF16 format, FMHA will also be performed in BF16 format, which cannot benefit from FP8 Tensor Core.
+
With FP8 context FMHA, we first quantize Q into FP8 format, which aligns with FP8 K and V, and then leverage FP8 Tensor Core for FMHA/MLA. Since the context phase is compute-bound and Tensor Core has much higher FP8 FLOPS than BF16 FLOPS, the speed-up becomes more pronounced as the input sequence length grows.
+
Since FP8 context FMHA can maintain accuracy very close to the BF16 baseline, we enable it automatically when users use FP8 KV cache on Hopper or Blackwell. This is supported by PR 7610 and PR 7612.
The LM (language modeling) head is responsible for converting the hidden_states computed by previous decode layers to logits. It’s a linear layer with weights in the shape of (vocab_size, hidden_size), outputting logits with the shape of (batch_size, seqlen, vocab_size). We are primarily interested in the logits corresponding to the last token of the input sequence, so the logits will finally be (batch_size, vocab_size).
+
+When MTP is enabled, the number of tokens that the MTP layers handle equals the batch size, while the main model handles (1+MTP)*batch_size tokens. This makes the LM head computation in the MTP layers more likely to fall into the memory-bound range; empirically, 256 tokens is the boundary between memory-bound and math-bound. This leads to an optimization idea: if we keep the calculation memory-bound but reduce the amount of weight data that must be loaded, there could be performance benefits.
+
Based on this analysis, we conducted experiments on the following scenario: a DeepSeek R1 EP32 case with attention DP and MTP-3 enabled, where the local per-rank batch size is 32. Before the optimization, there is 32-way data parallelism, so each MTP module on each rank processes 32 tokens for LM head calculation.
+
Figure 2: MTP LM head computation before optimization
+
In the optimization, we first perform an AllGather across every 4 GPUs, so that each GB200 node has all tokens prepared for the following TP4 calculation. Then, we split the LM head weights on the token dimension to fit those 4 GPUs and perform 4-way TP. Afterwards, we compute the local argmax of the logits on each TP rank, perform another AllGather to collect these local results, and find the global argmax across all TP ranks. Collecting the local argmax first helps minimize communication and argmax computation overheads. Finally, we split the logits to guarantee correctness.
+
Figure 3: MTP LM head computation after applying tensor parallelism
+
Some layers are omitted in the diagrams above to keep the example simple.
+
Note that we can expand the TP to 8-way to utilize multi-node NVLink, as long as we still achieve performance gains from reducing weight loading time in memory-bound scenarios.
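A toy sketch of the two-stage argmax described above; a single tensor stands in for the gathered per-rank results, and this is illustrative only, not the TensorRT LLM kernels:

```python
import torch

vocab_size, tp = 32, 4                      # toy vocabulary split across 4 TP ranks
logits = torch.randn(vocab_size)            # full logits for one token
shards = logits.chunk(tp)                   # each rank holds one vocab shard

# stage 1: every rank computes its local argmax (value and local index)
local_vals = torch.stack([s.max() for s in shards])
local_idx = torch.stack([s.argmax() for s in shards])

# stage 2: after AllGather-ing the small per-rank results, pick the winning
# rank and map its local index back to a global token id
win = local_vals.argmax()
global_token = win * (vocab_size // tp) + local_idx[win]
assert global_token == logits.argmax()
```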
In the standard attention mechanism, Q/K/V are derived from the same hidden states through GEMM_Q/GEMM_K/GEMM_V operations, and TensorRT LLM typically merges the weights of these three GEMMs in advance, executing a single GEMM_QKV to obtain a large contiguous tensor QKV, which is then used as the input to the attention kernels.
+
However, DeepSeek’s MLA is a special attention module where Q/K/V are obtained by applying different downsampling-upsampling processes to the hidden states. Additionally, Q and K are divided into two parts: with RoPE and without RoPE, so a contiguous QKV tensor cannot be obtained directly.
+
In the initial implementation of context MLA, due to input format constraints of the attention kernels, TensorRT LLM had to explicitly concatenate the Q/K/V tensors into one contiguous QKV tensor, resulting in extra memory and time overhead, which became more significant in wide EP scenarios.
+
Recently, we introduced a new input format for the context MLA kernels called “separate qkv”. As the name implies, these attention kernels now support three separate Q/K/V tensors as direct inputs. PR 6538 refactors the MLA process to eliminate the need for concatenating Q/K/V, saving copy operations and significantly improving prefill latency in wide EP scenarios.
The team has implemented aggressive kernel fusion, overlap, and optimization to reduce kernel launch overheads and overall kernel duration. This includes overlapping kernels using PDL, fusing several AlltoAll kernels through refactoring, fusing sparse exp and shared exp add into local reduction, fusing memset into expandinputrow, fusing finalizeMoeRouting into FC2, and removing the swizzle kernel after AlltoAll. The following three representative examples demonstrate the common ideas behind these optimizations.
+
+
Overlap kernels using programmatic dependent launch (PDL)
+
The Programmatic Dependent Launch (PDL) mechanism allows a dependent secondary kernel to launch before the primary kernel it depends on in the same CUDA stream has finished executing. Refer to the official documentation for more details. TensorRT LLM has been utilizing this feature to optimize end-to-end performance.
+
+We have introduced this feature to the kernels used by the wide EP workflow as well; the implementation is in PR 7977. We insert the cudaTriggerProgrammaticLaunchCompletion API in the primary kernel (invoked by all thread blocks), which signals that the secondary kernel may launch, and then call the cudaGridDependencySynchronize API in the secondary kernel, which blocks until all primary kernels the secondary kernel depends on have completed and flushed results to global memory. The following example from the official documentation demonstrates how PDL is supported in TensorRT LLM; the only difference is that we insert cudaTriggerProgrammaticLaunchCompletion and cudaGridDependencySynchronize into the same kernel so that it can overlap with both the preceding and the subsequent kernels.
+
+__global__ void primary_kernel() {
+    // Initial work that should finish before starting the secondary kernel
+
+    // Trigger the secondary kernel
+    cudaTriggerProgrammaticLaunchCompletion();
+
+    // Work that can coincide with the secondary kernel
+}
+
+__global__ void secondary_kernel()
+{
+    // Independent work
+
+    // Will block until all primary kernels the secondary kernel depends on
+    // have completed and flushed results to global memory
+    cudaGridDependencySynchronize();
+
+    // Dependent work
+}
+
+
+
We have verified the accuracy after the modification to ensure that computation results are not affected by incorrect memory reads and writes. On that basis, we made those kernels overlap as much as possible for performance. In TensorRT LLM, PDL can be enabled by setting the environment variable TRTLLM_ENABLE_PDL to 1, and we may introduce this as an official API in the future.
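For example, in a Python workflow the variable can be set before the engine is created; the variable name comes from the paragraph above, and the placement is illustrative:

```python
import os

# enable Programmatic Dependent Launch for the wide EP kernels
os.environ["TRTLLM_ENABLE_PDL"] = "1"
# ... construct the LLM / launch trtllm-serve as usual afterwards
```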
+
The effect of enabling PDL can be clearly observed using NVIDIA Nsight Systems. Taking moeComputeRouteKernel, computeCountAndIndiceDevice and computeCumsumDevice kernels as an example, they are executed in order when disabling PDL:
+
Figure 4: The profiling results of disabling PDL.
+
The following profiling results show how the three kernels overlap after enabling PDL.
+
Figure 5: The profiling results of enabling PDL.
+
The above profiles were generated by using commit 84d2f12 on the main branch. They may change in future versions.
To better support communication fusion (including hiddenStates during dispatch, the low-precision ScalingFactor, and MoE’s tokenSelectedExpert and scales), to support low-precision communication during dispatch, and to handle potential non-alignment issues in the original data, we redesigned and reimplemented AlltoAll.
+
Taking the dispatch of four fields as an example, the data flow is shown in Figure 6.
+
Figure 6: The data flow of the new AlltoAll kernel
+
The sending process is as follows:
+
+
The first step loads the original data according to the data alignment in global memory, using TMA to load into shared memory as unAlignedData.
+
Next, in shared memory, all fields are aligned to 16-byte boundaries and different fields are concatenated together to form alignedData.
+
If low-precision communication is needed, the aligned data is quantized into low-precision lowPrecisionData. Currently, quantization is only supported for a single field.
+
Next, corresponding encoding is performed according to the protocol. For example, with LL128, each 128 bytes contains 120 bytes of valid data and 8 bytes of flags. To avoid bank conflicts during encoding in shared memory, we select different flag positions for different packets, and the final encoded data is stored in protoPackedData+Flag.
+
Finally, the proto-encoded protoPackedData+Flag is written to the remote GPU’s workspace.
+
+
The receiver only needs to check the flag at the corresponding position in the workspace to confirm whether the data is ready. Once it is, the original data is decoded in the reverse order of the sending process and written to the corresponding tensors.
+
Through this approach, we can support sending and receiving multiple arbitrarily aligned fields in a fused manner and support low-precision communication during the combine process. This feature was implemented in PR 6973.
+
+
+
Fuse add (sparse exp and shared exp) into local reduction
+
To reduce the number of kernel launches and achieve better overlap at the tail of the MoE module, we’ve fused the shared-expert add into the local reduction kernel that aggregates top-k experts. This removes the extra add operator without increasing the reduce operator’s overhead. It also achieves single write-out and lower bandwidth occupancy.
+
The optimization is compatible with NVFP4 combine without requiring any API changes and brings no accuracy impact. It was added by PR 7422.
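Conceptually, the fused kernel performs the weighted top-k reduction and the shared-expert add in one pass with a single write-out; a toy PyTorch sketch of the math, with illustrative shapes:

```python
import torch

tokens, topk, hidden = 8, 4, 16
topk_out = torch.randn(tokens, topk, hidden)   # per-expert FC2 outputs after combine
topk_w = torch.rand(tokens, topk)              # router weights of the selected experts
shared = torch.randn(tokens, hidden)           # shared-expert output

# unfused: a reduction kernel followed by a separate add kernel
out_unfused = (topk_out * topk_w.unsqueeze(-1)).sum(dim=1)
out_unfused = out_unfused + shared

# fused: the add folded into the reduction, written out once
out_fused = (topk_out * topk_w.unsqueeze(-1)).sum(dim=1) + shared
assert torch.allclose(out_unfused, out_fused)
```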
+
+
+
Optimize PyTorch native copy and concat using torch.compile
+
+We have observed several inefficient copy and concat operations in the context phase in wide EP scenarios; one significant case is copying k_nope in the MLA module. As mentioned in the previous section, Q and K are divided into two parts in DeepSeek MLA: with RoPE and without RoPE. In the context phase, the nope part has a head size of 128 and the rope part has a head size of 64, which adds up to a head size of 192. However, the FMHA kernel directly reads Q and K with head size 192, which means we have to prepare the full Q and K using copy and concat.
+
+On ISL/OSL 8k/1k, batch size 1 cases, in the context phase, we observed that the copy operation takes 306 us, which is clearly suboptimal. Estimating a theoretical duration with roughly 8 TB/s of HBM3e bandwidth, the formula would roughly be:
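A back-of-envelope version of that estimate, under assumed shapes (8192 tokens, 128 heads, a 128-dim nope part, BF16) and counting one read plus one write per byte:

```python
tokens, heads, head_dim, dtype_bytes = 8192, 128, 128, 2   # assumed shapes, BF16
hbm_bw = 8e12                                              # ~8 TB/s HBM3e

bytes_moved = 2 * tokens * heads * head_dim * dtype_bytes  # read + write
print(f"theoretical copy time ~ {bytes_moved / hbm_bw * 1e6:.0f} us")  # roughly 67 us
```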
To optimize the operator, we simply added the torch.compile decorator to the operation, and the kernel duration drops to 107 us, a large reduction that is already at a promising level. PR 8044 implemented the changes. This is an outstanding example of the power of torch.compile, showing how to analyze and optimize without heavily hand-crafting kernels.
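A hedged sketch of the idea behind that change; the function name and shapes are illustrative, not the exact code from PR 8044:

```python
import torch

@torch.compile  # lets the compiler fuse the broadcast, copy, and concat into fewer kernels
def build_full_k(k_nope: torch.Tensor, k_rope: torch.Tensor) -> torch.Tensor:
    # k_nope: [tokens, heads, 128]; k_rope: [tokens, 1, 64], shared across heads
    k_rope = k_rope.expand(-1, k_nope.shape[1], -1)
    return torch.cat([k_nope, k_rope], dim=-1)  # [tokens, heads, 192]

device = "cuda" if torch.cuda.is_available() else "cpu"
k_nope = torch.randn(8192, 128, 128, device=device, dtype=torch.bfloat16)
k_rope = torch.randn(8192, 1, 64, device=device, dtype=torch.bfloat16)
k = build_full_k(k_nope, k_rope)
```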
After applying the optimizations above, the network structure is cleaner. For example, o_proj and the A2A tokens now compute in lower precision, and operators such as the add of sparse-expert and shared-expert outputs are now fused into the reduction. The optimized parts are marked in bold.
+
Figure 7: Network structure overview after optimization
+
We measured one round of performance and compared it with the baseline (main branch in July). With the optimizations mentioned above, we can see a significant performance improvement.
+
Figure 8: End-to-End Performance on Aug 31st
+
Note: The numbers were collected on August 31st. Some optimizations mentioned above were not yet added at that time.
+
To review how wide EP helps with Blackwell’s leading inference benchmarks, also read these recent blog posts:
This is a great continuation of previous work on TensorRT-LLM wide EP and another demonstration of excellent teamwork. It stems from brilliant performance optimization ideas, solid performance analysis and benchmarking, and rapid engineering support and implementation. By sharing these experiences, we hope to help more people who are interested in deploying large-scale LLM models on NVIDIA GPUs to run AI faster.
\ No newline at end of file
diff --git a/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html b/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html
index 92448d2e55..3bc9b757de 100644
--- a/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html
+++ b/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html
@@ -59,7 +59,7 @@
@@ -67,11 +67,11 @@
-
+
-
+
@@ -337,6 +337,7 @@
@@ -1448,9 +1452,9 @@ Based on our current performance analysis, when you plan to apply large-scale EP
diff --git a/latest/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html b/latest/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html
index f7dd387aa8..4f5556f661 100644
--- a/latest/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html
+++ b/latest/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html
@@ -61,7 +61,7 @@
@@ -73,7 +73,7 @@
-
+
@@ -339,6 +339,7 @@
The GEMM plugin that utilizes NVIDIA cuBLASLt to perform GEMM operations. Note: it’s only effective for non-quantized GEMM operations (except FP8). Note: for FP8, it also requires the same calibration in the checkpoint.
--gemm_swiglu_plugin
-
Possible choices: fp8, disable
+
Possible choices: fp8, disable, disable
The GEMM + SwiGLU fusion in Gated-MLP combines two Matmul operations and one SwiGLU operation into a single kernel. Currently this is only supported for FP8 precision on Hopper.
Default: 'disable'
--fp8_rowwise_gemm_plugin
-
Possible choices: auto, float16, float32, bfloat16, int32, disable
+
Possible choices: auto, float16, float32, bfloat16, int32, disable, disable
The quantized GEMM for fp8, which uses per token dynamic scales for activation and per channel static scales for weights.Note: It also requires same calibration in checkpoint.
Default: 'disable'
--nccl_plugin
-
Possible choices: auto, float16, float32, bfloat16, int32, disable
+
Possible choices: auto, float16, float32, bfloat16, int32, disable, disable
The NCCL plugin wraps NCCL operators to support multi-GPU and even multi-nodes.
Default: 'auto'
--lora_plugin
-
Possible choices: auto, float16, float32, bfloat16, int32, disable
+
Possible choices: auto, float16, float32, bfloat16, int32, disable, disable
Enable LoRA.
Default: 'disable'
@@ -792,27 +796,27 @@
Default: 'disable'
--moe_plugin
-
Possible choices: auto, float16, float32, bfloat16, int32, disable
+
Possible choices: auto, float16, float32, bfloat16, int32, disable, disable
Enable some customized kernels to speed up the MoE layer of MoE models.
Default: 'auto'
--mamba_conv1d_plugin
-
Possible choices: auto, float16, float32, bfloat16, int32, disable
+
Possible choices: auto, float16, float32, bfloat16, int32, disable, disable
Enable customized kernels to speed up conv1d operator for Mamba.
Default: 'auto'
--low_latency_gemm_plugin
-
Possible choices: fp8, disable
+
Possible choices: fp8, disable, disable
The GEMM plugin that optimized specially for low latency scenarios.
Default: 'disable'
--low_latency_gemm_swiglu_plugin
-
Possible choices: fp8, disable
+
Possible choices: fp8, disable, disable
The GEMM + SwiGLU fusion plugin that optimized specially for low latency scenarios.
Default: 'disable'
--gemm_allreduce_plugin
-
Possible choices: float16, bfloat16, disable
+
Possible choices: float16, bfloat16, disable, disable
Across different requests, average TPOT is the mean of each request’s TPOT (all requests weighted equally), while average ITL is token-weighted (all tokens weighted equally):
There are multiple MOE backends inside TRT-LLM, not all of them supporting every precision on every GPUs. Here are the support matrix of the MOE backends.
+
There are multiple MOE backends inside TensorRT LLM, and not all of them support every precision on every GPU. Here is the support matrix of the MOE backends.
Below is an example command to launch the TRT-LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
Below is an example command to launch the TensorRT LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
When the Status:200 code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
-
After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server.
+
After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
curlhttp://localhost:8000/v1/completions-H"Content-Type: application/json"-d'{ "model": "deepseek-ai/DeepSeek-R1-0528", "prompt": "Where is New York?",
@@ -783,7 +782,7 @@ cat<< EOF > ${EXTRA_LLM_API_FIL
}'
-
Here is an example response, showing that the TRT-LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
+
Here is an example response, showing that the TensorRT LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
{"id":"cmpl-e728f08114c042309efeae4df86a50ca","object":"text_completion","created":1754294810,"model":"deepseek-ai/DeepSeek-R1-0528","choices":[{"index":0,"text":" / by Megan Stine ; illustrated by John Hinderliter.\n\nBook | Gross","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null}],"usage":{"prompt_tokens":6,"total_tokens":22,"completion_tokens":16},"prompt_token_ids":null}
To benchmark the performance of your TensorRT LLM server you can leverage the built-in “benchmark_serving.py” script. To do this first creating a wrapper bench.sh script.
+
To benchmark the performance of your TensorRT LLM server, you can leverage the built-in benchmark_serving.py script. To do this, first create a wrapper bench.sh script.
Across different requests, average TPOT is the mean of each request’s TPOT (all requests weighted equally), while average ITL is token-weighted (all tokens weighted equally):
Below is an example command to launch the TRT-LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
Below is an example command to launch the TensorRT LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
When the Status:200 code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
-
After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server.
+
After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
Here is an example response, showing that the TRT-LLM server reasons and answers the questions.
+
Here is an example response, showing that the TensorRT LLM server reasons and answers the questions.
TODO: Use Chat Completions API / Responses API as the example after the PR is merged.
{"id":"chatcmpl-c5bf51b5cab94e10ba5da5266d12ee59","object":"chat.completion","created":1755815898,"model":"openai/gpt-oss-120b","choices":[{"index":0,"message":{"role":"assistant","content":"analysisThe user asks: \"Where is New York?\" Likely they want location info. Provide answer: New York State in northeastern US, New York City on the east coast, coordinates, etc. Provide context.assistantfinal**New York** can refer to two related places in the United States:\n\n| What it is | Where it is | Approx. coordinates | How to picture it |\n|------------|------------|--------------------|-------------------|\n| **New York State** | The northeastern corner of the United States, bordered by **Vermont, Massachusetts, Connecticut, New Jersey, Pennsylvania, and the Canadian provinces of Ontario and Quebec**. | 42.7° N, 75.5° W (roughly the state’s geographic centre) | A roughly rectangular state that stretches from the Atlantic Ocean in the southeast to the Adirondack Mountains and the Great Lakes region in the north. |\n| **New York City (NYC)** | The largest city in the state, located on the **southern tip of the state** where the **Hudson River meets the Atlantic Ocean**. It occupies five boroughs: Manhattan, Brooklyn, Queens, The Bronx, and Staten Island. | 40.7128° N, 74.0060° W | A dense, world‑famous metropolis that sits on a series of islands (Manhattan, Staten Island, parts of the Bronx) and the mainland (Brooklyn and Queens). |\n\n### Quick geographic context\n- **On a map of the United States:** New York State is in the **Northeast** region, just east of the Great Lakes and north of Pennsylvania. \n- **From Washington, D.C.:** Travel roughly **225 mi (360 km) northeast**. \n- **From Boston, MA:** Travel about **215 mi (350 km) southwest**. \n- **From Toronto, Canada:** Travel about **500 mi (800 km) southeast**.\n\n### Travel tips\n- **By air:** Major airports include **John F. Kennedy International (JFK)**, **LaGuardia (LGA)**, and **Newark Liberty International (EWR)** (the latter is actually in New Jersey but serves the NYC metro area). \n- **By train:** Amtrak’s **Northeast Corridor** runs from **Boston → New York City → Washington, D.C.** \n- **By car:** Interstates **I‑87** (north‑south) and **I‑90** (east‑west) are the primary highways crossing the state.\n\n### Fun fact\n- The name “**New York**” was given by the English in 1664, honoring the Duke of York (later King James II). The city’s original Dutch name was **“New Amsterdam.”**\n\nIf you need more specific directions (e.g., how to get to a particular neighborhood, landmark, or the state capital **Albany**), just let me know!","reasoning_content":null,"tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null,"mm_embedding_handle":null,"disaggregated_params":null,"avg_decoded_tokens_per_iter":1.0}],"usage":{"prompt_tokens":72,"total_tokens":705,"completion_tokens":633},"prompt_token_ids":null}
Across different requests, average TPOT is the mean of each request’s TPOT (all requests weighted equally), while average ITL is token-weighted (all tokens weighted equally):
GPU: NVIDIA Blackwell or Hopper Architecture
-OS: Linux
-Drivers: CUDA Driver 575 or Later
-Docker with NVIDIA Container Toolkit installed
+
GPU: NVIDIA Blackwell or Hopper Architecture
+OS: Linux
+Drivers: CUDA Driver 575 or Later
+Docker with NVIDIA Container Toolkit installed
Python3 and python3-pip (Optional, for accuracy evaluation only)
Below is an example command to launch the TensorRT LLM server with the Llama-3.3-70B-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
Description: A value between 0.0 and 1.0 that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
Recommendation: If you experience OOM errors, try reducing this value to 0.8 or lower.
When the Status:200 code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
    "model": "nvidia/Llama-3.3-70B-Instruct-FP8",
    "prompt": "Where is New York?",
}'
Here is an example response, showing that the TensorRT LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
{"id":"cmpl-bc1393d529ce485c961d9ffee5b25d72","object":"text_completion","created":1753843963,"model":"nvidia/Llama-3.3-70B-Instruct-FP8","choices":[{"index":0,"text":" New York is a state located in the northeastern United States. It is bordered by","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null}],"usage":{"prompt_tokens":6,"total_tokens":22,"completion_tokens":16},"prompt_token_ids":null}
To benchmark the performance of your TensorRT LLM server, you can leverage the built-in benchmark_serving.py script. To do this, first create a wrapper bench.sh script.
Across different requests, average TPOT is the mean of each request’s TPOT (all requests weighted equally), while average ITL is token-weighted (all tokens weighted equally):
Below is an example command to launch the TensorRT LLM server with the Llama-4-Scout-17B-16E-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
When the Status:200 code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
    "model": "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
    "prompt": "Where is New York?",
}'
Here is an example response, showing that the TensorRT LLM server returns “New York is a state located in the northeastern United States. It is bordered by”, completing the input sequence.
{"id":"cmpl-bc1393d529ce485c961d9ffee5b25d72","object":"text_completion","created":1753843963,"model":"$MODEL","choices":[{"index":0,"text":" New York is a state located in the northeastern United States. It is bordered by","token_ids":null,"logprobs":null,"context_logits":null,"finish_reason":"length","stop_reason":null,"disaggregated_params":null}],"usage":{"prompt_tokens":6,"total_tokens":22,"completion_tokens":16},"prompt_token_ids":null}
Across different requests, average TPOT is the mean of each request’s TPOT (all requests weighted equally), while average ITL is token-weighted (all tokens weighted equally):
This is a functional quick-start guide for running the Qwen3-Next model on TensorRT LLM. It focuses on a working setup with recommended defaults. Additional performance optimizations and support will be rolled out in future updates.
Description: Sets the expert-parallel size for Mixture-of-Experts (MoE) models. Like tp_size, this should generally match the number of GPUs you’re using. This setting has no effect on non-MoE models.
Description: A value between 0.0 and 1.0 that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
Recommendation: If you experience OOM errors, try reducing this value to 0.7 or lower.
Description: The maximum number of user requests that can be grouped into a single batch for processing. The actual max batch size that can be achieved depends on total sequence length (input + output).
Description: The maximum possible sequence length for a single request, including both input and generated output tokens. We do not set it explicitly; it will be inferred from the model config.
These options provide finer control over performance and are set within a YAML file passed to the trtllm-serve command via the --extra_llm_api_options argument.
When the Status:200 code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
After the TensorRT LLM server is set up and shows Application startup complete, you can send requests to the server.
{"id":"chatcmpl-64ac201c77bf46a7a3a4eca7759b1fd8","object":"chat.completion","created":1759022940,"model":"Qwen/Qwen3-Next-80B-A3B-Thinking","choices":[{"index":0,"message":{"role":"assistant","content":"Okay, the user is asking \"Where is New York?\" Hmm, this seems straightforward but I need to be careful. New York could mean different things—maybe they're confused about the city versus the state. \n\nFirst thought: Are they a tourist planning a trip? Or maybe a student doing homework? Could even be someone国外 who's only heard \"New York\" in movies and isn't sure if it's a city or state. \n\nI should clarify both possibilities immediately. People often mix them up. Like, if someone says \"I'm going to New York\" they're probably talking about NYC, but technically New York State is bigger. \n\nLet me break it down: \n- New York City (NYC) is the famous one—Manhattan, skyscrapers, Times Square. \n- Then New York State (NY) is the whole state, which includes NYC but also upstate areas like Albany (the capital), Buffalo, and even the Adirondacks. \n\nWait, should I mention that NYC is in New York State? Yeah, that's crucial. Otherwise they might think it's two separate things. Also, where is the state located? Northeast US, borders other states like Pennsylvania, New Jersey... maybe name a few neighbors for context. \n\nOh! And the city vs state confusion is super common. Like, people say \"New York\" for the city but forget it's part of a larger state. Should I give examples? \n- \"If someone says 'hot dogs in New York' they mean NYC\"\n- But \"the New York State Police\" covers the whole state\"\n\n...Maybe add a fun fact? Like how NYC is just one city within the state? Or that the state capital isn't NYC but Albany? Yeah, that'll help clear confusion. \n\nAlso—should I ask if they meant the city or the state? Nah, better to cover both upfront. Keep it simple but precise. No jargon. They might not know terms like \"metropolitan area\" or \"boroughs,\" so explain \"boroughs\" as neighborhoods (Manhattan, Brooklyn etc.). \n\nCoordinates: 40.7°N, 74°W? Maybe overkill unless they're into geography. But for the state, roughly 42°N to 45°N latitude? Hmm, probably unnecessary. Stick to landmarks: borders Lake Erie, Atlantic coast... \n\nWait—user might be asking literally for GPS coordinates? Unlikely, but just in case I can add: \"Geographically, New York City is at approximately 40.7°N latitude and 74.0°W longitude...\" but only if the answer feels too dry otherwise. \n\nPriority: Clear, simple, correct. No assuming prior knowledge. Say \"New York can refer to...\" then split into two cases. End with \"Most people mean the city when they say 'New York' in casual talk.\" That’s the key takeaway. \n\n...Double-check: Is Albany really the capital? Yes. And NYC is the largest city but not capital. Got it. Also, New York State is the 27th largest by area? Maybe not relevant. Stick to basics. \n\nFinal structure: \n1. \"New York can mean two things: the city or the state.\"\n2. For city: \n - Where it is (Northeast US) \n - Inside New York State \n - Famous parts (Manhattan etc.) \n3. For state: \n - Larger area, includes city \n - Borders (PA, NJ, CT, MA, VT, Canada) \n - Capital is Albany (not NYC!) \n4. Clear summary: \"In everyday talk, 'New York' usually means NYC.\"\n\nYeah. That covers it without overwhelming them. No markdown, just plain English. Keep it warm—add a \"hope that helps!\" at the end.\n</think>\n\nThat's a common point of confusion! 
**\"New York\" can refer to two different things**, and it's important to distinguish between them:\n\n### 1. **New York City (NYC)** \n - **What it is**: The largest and most famous city in the United States, known for landmarks like Times Square, Central Park, the Statue of Liberty, and Wall Street. \n - **Where it is**: \n - Located in the **northeastern United States**. \n - Situated at the mouth of the **Hudson River**, where it meets the **Atlantic Ocean**. \n - Part of **New York State** (see below). \n - **Geographic details**: \n - Coordinates: Approximately **40.7° N latitude, 74.0° W longitude**. \n - Composed of **5 boroughs**: Manhattan (the \"city\" most people picture), Brooklyn, Queens, The Bronx, and Staten Island. \n - Panoramic view of NYC (including Brooklyn and New Jersey skyline):","reasoning_content":null,"reasoning":null,"tool_calls":[]},"logprobs":null,"finish_reason":"length","stop_reason":null,"mm_embedding_handle":null,"disaggregated_params":null,"avg_decoded_tokens_per_iter":1.0}],"usage":{"prompt_tokens":15,"total_tokens":1039,"completion_tokens":1024},"prompt_token_ids":null}
To benchmark the performance of your TensorRT LLM server, you can leverage the built-in benchmark_serving.py script. To do this, first create a wrapper bench.sh script.
To run the benchmark with the generated data set, simply use the trtllm-bench throughput subcommand. The benchmarker will
run an offline maximum throughput scenario such that all requests are queued in rapid succession. You simply need to provide
a model name (HuggingFace reference or path to a local model), a generated dataset, and a file containing any desired extra options to the LLM APIs (details in tensorrt_llm/llmapi/llm_args.py:LlmArgs).
@@ -837,7 +841,7 @@ reach that point).
the different requests by a cache manager during processing. That cache manager
keeps track of the sequences, allocates new blocks from a pool and recycles those
blocks when required. See the implementation of
KVCacheManager.
# Using single YAML config
python build_and_run_ad.py \
  --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
  --yaml-extra my_config.yaml

# Using multiple YAML configs (deep merged in order, later files have higher priority)
python build_and_run_ad.py \
  --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
  --yaml-extra my_config.yaml production.yaml

# Targeting nested AutoDeployConfig with separate YAML
python build_and_run_ad.py \
  --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
  --yaml-extra my_config.yaml \
  --args.yaml-extra autodeploy_overrides.yaml
The configuration system follows a precedence order in which higher priority sources override lower priority ones:
CLI Arguments (highest priority) - Direct command line arguments
YAML Configs - Files specified via --yaml-extra and --args.yaml-extra
Default Settings (lowest priority) - Built-in defaults from the config classes
Deep Merging: Unlike simple overwriting, deep merging recursively combines nested dictionaries. For example:
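The original example is not reproduced here; the following minimal Python sketch (with hypothetical config keys) shows the difference between simple overwriting and deep merging:
def deep_merge(lower: dict, higher: dict) -> dict:
    """Recursively merge `higher` into `lower`; `higher` wins on conflicts."""
    merged = dict(lower)
    for key, value in higher.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

base = {"runtime": {"world_size": 2, "compile_backend": "torch-opt"}}  # e.g. my_config.yaml
override = {"runtime": {"world_size": 8}}                              # e.g. production.yaml
print({**base, **override})        # simple overwrite drops compile_backend entirely
print(deep_merge(base, override))  # deep merge keeps it and only overrides world_size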
Nested Config Behavior: When using nested configurations, outer YAML configuration files become initialization settings for inner objects, giving them higher precedence:
# The outer yaml-extra affects the entire ExperimentConfig
# The inner args.yaml-extra affects only the AutoDeployConfig
python build_and_run_ad.py \
  --model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
  --yaml-extra experiment_config.yaml \
  --args.yaml-extra autodeploy_config.yaml \
  --args.world-size=8  # CLI override beats both YAML configs
This feature is currently in beta, and the related APIs are subject to change.
max_tokens_in_buffer: <int>
backend specifies the communication backend for transferring the kvCache; valid options include DEFAULT, UCX, NIXL, and MPI. The default backend is NIXL.
max_tokens_in_buffer defines the buffer size for kvCache transfers; it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests for optimal performance.
For example, you could launch two context servers and one generation server as follows:
@@ -894,9 +898,9 @@ when routing requests to the generation servers, the disaggregated server will m
diff --git a/latest/features/feature-combination-matrix.html b/latest/features/feature-combination-matrix.html
index 802cb8c5f8..8ec0b632cd 100644
--- a/latest/features/feature-combination-matrix.html
+++ b/latest/features/feature-combination-matrix.html
The following examples demonstrate how to use TensorRT LLM’s multimodal support in various scenarios, including quick run examples, serving endpoints, and performance benchmarking.
You can then send OpenAI-compatible requests, such as via curl or API clients, to the server endpoint. See the curl chat client for multimodal script as an example.
The paged KV cache decomposes the KV cache into blocks that are distributed to
the different requests by a cache manager during processing. That cache manager
keeps track of the sequences, allocates new blocks from a pool, and recycles those blocks when required. See the simplified implementation of
tensorrt_llm.runtime.KVCacheManager.
A more efficient C++ implementation is included in the
Batch Manager.
@@ -781,9 +785,9 @@ A more efficient C++ implementation is included in the
diff --git a/latest/features/parallel-strategy.html b/latest/features/parallel-strategy.html
index 8ae0cc1071..d04b12c523 100644
--- a/latest/features/parallel-strategy.html
+++ b/latest/features/parallel-strategy.html
This project is under active development and currently in a prototype stage. The current focus is on core functionality, with performance optimization coming soon. While we strive for correctness, there are currently no guarantees regarding functionality, stability, or reliability.
The Ray orchestrator uses Ray instead of MPI to manage workers for single- and multi-node inference. It’s a first step toward making TensorRT-LLM a better fit for Reinforcement Learning from Human Feedback (RLHF) workflows. For RLHF, Ray can dynamically spawn and reconnect distributed inference actors, each with its own parallelism strategy. This feature is a prototype and under active development. MPI remains the default in TensorRT-LLM.
To run a simple TP=2 example with a Hugging Face model:
python llm_inference_distributed_ray.py
This example is the same as in /examples/llm-api, with the only change being orchestrator_type="ray" on LLM(). Other examples can be adapted similarly by toggling this flag.
Initial testing has been focused on LLaMA and DeepSeek variants. Please open an Issue if you encounter problems with other models so we can prioritize support.
This feature introduces new classes such as RayExecutor and RayGPUWorker for Ray actor lifecycle management and distributed inference. In Ray mode, collective ops run on torch.distributed without MPI. We welcome contributions to improve and extend this support.
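A minimal sketch of that change (the model name, parallel size, and prompt are placeholders, not part of the original example):
from tensorrt_llm import LLM, SamplingParams

# Identical in shape to the standard LLM-API example; the only Ray-specific
# part is orchestrator_type="ray", which swaps MPI for Ray-managed workers.
llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tensor_parallel_size=2,
    orchestrator_type="ray",
)
outputs = llm.generate(["Where is New York?"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)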
diff --git a/latest/features/sampling.html b/latest/features/sampling.html
index 24b4127e71..5546a6b010 100644
--- a/latest/features/sampling.html
+++ b/latest/features/sampling.html
diff --git a/latest/genindex.html b/latest/genindex.html
index 5fc798d96f..e70bc9e3b2 100644
--- a/latest/genindex.html
+++ b/latest/genindex.html
TensorRT LLM 1.1 supports both CUDA 12.9 and 13.0, while some dependency changes are required. The requirements.txt contains the dependencies needed by CUDA 13.0. If you are using CUDA 12.9, please uncomment the lines ending with # <For CUDA 12.9> and comment out the next lines.
The following command compiles the C++ code and packages the compiled libraries along with the Python files into a wheel. When developing C++ code, you need this full build command to apply your code changes.
@@ -506,7 +510,7 @@ on NGC. This is likely the simplest way to obtain TensorRT LLM. Please refer to
Container image tags
In the example shell commands, x.y.z corresponds to the TensorRT-LLM container
version to use. If omitted, IMAGE_TAG will default to tensorrt_llm.__version__
(e.g., this documentation was generated from the 1.2.0rc1 source tree).
If this does not work, e.g., because a container for the version you are
currently working with has not been released yet, you can try using a
container published for a previous
@@ -646,9 +650,9 @@ for all related options.
TensorRT LLM 1.1 supports both CUDA 12.9 and 13.0. The wheel package release only supports CUDA 12.9, while CUDA 13.0 is only supported through NGC container release.
# Optional step: Only required for NVIDIA Blackwell GPUs and SBSA platform
pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

sudo apt-get -y install libopenmpi-dev

# Optional step: Only required for disagg-serving
sudo apt-get -y install libzmq3-dev
The PyTorch CUDA 12.8 package is required to support NVIDIA Blackwell GPUs and the SBSA platform. On prior GPUs or the Linux x86_64 platform, this extra installation is not required.
@@ -710,9 +721,9 @@ The setup method depends on your Slurm configuration; please check with your admin
diff --git a/latest/legacy/advanced/disaggregated-service.html b/latest/legacy/advanced/disaggregated-service.html
index 5b6a9e9c9d..ab36911c34 100644
--- a/latest/legacy/advanced/disaggregated-service.html
+++ b/latest/legacy/advanced/disaggregated-service.html
@@ -690,9 +694,9 @@ This feature is currently in prototype, and the related API is subjected to chan
diff --git a/latest/legacy/advanced/executor.html b/latest/legacy/advanced/executor.html
index b689c49d24..3476a16853 100644
--- a/latest/legacy/advanced/executor.html
+++ b/latest/legacy/advanced/executor.html
TensorRT-LLM includes a high-level C++ API called the Executor API which allows you to execute requests
asynchronously, with in-flight batching, and without the need to define callbacks.
A software component (referred to as “the client” in the text that follows) can interact
with the executor using the API defined in the executor.h file.
For details about the API, refer to the _cpp_gen/executor.rst.
The following sections provide an overview of the main classes defined in the Executor API.
@@ -570,7 +574,7 @@ This allows the runtime to reconfigure itself for a new beam width when no reque
stop_token_ids=[tokenizer.eos_token_id]
Refer to tensorrt_llm/llmapi/tokenizer.py for more details. You may dump these materials to disk and reload them into the C++ runtime for use.
Each request can be optionally specified with a GuidedDecodingParams, which defines the desired structured format. Currently, it supports four types:
GuidedDecodingParams::GuideType::kJSON: The generated text is amenable to JSON format;
@@ -619,12 +623,12 @@ This allows the runtime to reconfigure itself for a new beam width when no reque
Python bindings for the Executor API are also available to use the Executor API from Python. The Python bindings are defined in bindings.cpp and, once built, are available in the package tensorrt_llm.bindings.executor. Running help('tensorrt_llm.bindings.executor') in a Python interpreter will provide an overview of the classes available.
In addition, three Python examples are provided to demonstrate how to use the Python bindings to the Executor API for single and multi-GPU models. They can be found in examples/bindings.
In-flight Batching with the Triton Inference Server
@@ -670,9 +674,9 @@ reach that point).
the different requests by a cache manager during processing. That cache manager
keeps track of the sequences, allocates new blocks from a pool, and recycles those
blocks when required. See the simplified implementation of
tensorrt_llm.runtime.KVCacheManager.
A more efficient C++ implementation is included in the
Batch Manager.
@@ -965,9 +969,9 @@ is computed as:
diff --git a/latest/legacy/advanced/gpt-runtime.html b/latest/legacy/advanced/gpt-runtime.html
index 861968002c..0e73b0339d 100644
--- a/latest/legacy/advanced/gpt-runtime.html
+++ b/latest/legacy/advanced/gpt-runtime.html
@@ -733,9 +737,9 @@ An “event” is any significant change in the lifecycle or state of a KV cache
diff --git a/latest/legacy/advanced/kv-cache-reuse.html b/latest/legacy/advanced/kv-cache-reuse.html
index 812ad1a630..c95bee6b7a 100644
--- a/latest/legacy/advanced/kv-cache-reuse.html
+++ b/latest/legacy/advanced/kv-cache-reuse.html
@@ -706,9 +710,9 @@ Assume vocabulary size is 100, which means normal text token ids are in range [0
diff --git a/latest/legacy/advanced/lora.html b/latest/legacy/advanced/lora.html
index 63882d7269..2efe191334 100644
--- a/latest/legacy/advanced/lora.html
+++ b/latest/legacy/advanced/lora.html
@@ -820,9 +824,9 @@ However, similar to any new model, you can follow the same approach to define yo
diff --git a/latest/legacy/advanced/weight-streaming.html b/latest/legacy/advanced/weight-streaming.html
index 4050840cfa..95a7466d49 100644
--- a/latest/legacy/advanced/weight-streaming.html
+++ b/latest/legacy/advanced/weight-streaming.html
@@ -506,7 +510,7 @@ to create graph representations of deep neural networks in TensorRT. To become
familiar with the core concepts of the TensorRT API, refer to the
Core Concepts
section of the TensorRT documentation before proceeding further.
In TensorRT-LLM, the tensorrt_llm.Builder class
contains a
tensorrt.Builder
object. That instance is used in the tensorrt_llm.Builder.create_network
@@ -514,7 +518,7 @@ method to create an instance of the
tensorrt.INetworkDefinition
class. The INetworkDefinition object can then be populated using the free
functions defined in the
tensorrt_llm.functional.
A simple example of such a free function is tensorrt_llm.activation that inserts a
tensorrt.IActivationLayer
node in the graph of the model:
@@ -649,14 +653,14 @@ limitation, TensorRT offers a powerful mechanism known as
plugins.
The plugins are nodes inserted in the network graph definition that map to user-defined
GPU kernels. TensorRT-LLM uses a number of such plugins. They can be found in
the cpp/tensorrt_llm/plugins directory.
Plugins are written in C++ and follow a well-defined interface described in the
Extending TensorRT with Custom Layers
section of the TensorRT
Developer Guide.
When executed within a TensorRT engine, plugins trigger the execution of
their encapsulated GPU kernels. A fairly simple example of plugins is the
QuantizeTensorPlugin that
triggers a CUDA kernel in the QuantizeTensorPlugin::enqueue member function:
// In cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp:
@@ -700,7 +704,7 @@ using TensorRT plugins that wrap communication primitives from the
plugin that optimizes the All-Reduce primitive in the presence of All-to-all
connections between GPUs (through NVSwitch in DGX systems).
@@ -905,9 +909,9 @@ This can be enabled via the LLM-API as such
diff --git a/latest/legacy/performance/performance-tuning-guide/useful-runtime-flags.html b/latest/legacy/performance/performance-tuning-guide/useful-runtime-flags.html
index d0a086a1a0..f0cd6da948 100644
--- a/latest/legacy/performance/performance-tuning-guide/useful-runtime-flags.html
+++ b/latest/legacy/performance/performance-tuning-guide/useful-runtime-flags.html
@@ -1251,7 +1255,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
num_classes:int,
hidden_size:int,
dropout_prob:float=0.0,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -1470,7 +1474,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
embedding_dim,
pooled_projection_dim,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -1498,7 +1502,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
num_classes,
embedding_dim,
class_dropout_prob=0.0,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -1871,7 +1875,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
dim_out:int,
approximate:str='tanh',
bias:bool=True,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -1893,7 +1897,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
dim_out:int,
approximate:str='tanh',
bias:bool=True,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -1914,7 +1918,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
dim_in:int,
dim_out:int,
bias:bool=True,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -1935,7 +1939,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
dim_in:int,
dim_out:int,
bias:bool=True,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -1957,7 +1961,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
dim_out:int,
bias:bool=True,
activation:str='silu',
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -2049,7 +2053,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
norm_elementwise_affine:bool=False,
norm_eps:float=1e-05,
chunk_dim:int=0,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -2079,7 +2083,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
num_embeddings:int|None=None,
norm_type:str='layer_norm',
bias:bool=True,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -2110,7 +2114,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
embedding_dim:int,
norm_type:str='layer_norm',
bias:bool=True,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -2141,7 +2145,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
eps:float=1e-05,
bias:bool=True,
norm_type:str='layer_norm',
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -2169,7 +2173,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
embedding_dim:int,
norm_type:str='layer_norm',
bias:bool=True,
mapping=<tensorrt_llm.mapping.MpiTopology object>,
dtype=None,
@@ -2577,9 +2581,9 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
diff --git a/latest/legacy/python-api/tensorrt_llm.models.html b/latest/legacy/python-api/tensorrt_llm.models.html
index 89864c8e4a..1fbd9d6929 100644
--- a/latest/legacy/python-api/tensorrt_llm.models.html
+++ b/latest/legacy/python-api/tensorrt_llm.models.html
There are two option categories:
* Plugin options (typically with xxx_plugin naming). These options can be assigned with:
@@ -523,8 +527,912 @@
Show JSON schema
{
+"title":"PluginConfig",
+"description":"The config that manages plugin-related options.\n\nThere are two option categories:\n* Plugin options (typically with xxx_plugin naming). These options can be assigned with:\n * \"float16\"/\"bfloat16\"/\"float32\"/\"int32\", which means the plugin is enabled with the specified precision; (Some plugins only support limited dtype, i.e., gemm_swiglu_plugin and low_latency_gemm_swiglu_plugin only supports fp8 now)\n * \"auto\", which means the plugin is enabled with the precision of `dtype` field (the `dtype` field must be same to model dtype, i.e., the one in PretrainedConfig);\n * None, which means the plugin is disabled.\n* Other features. These options can be assigned with boolean:\n * True, which means the plugin is enabled;\n * False, which means the plugin is disabled.",
+"type":"object",
+"properties":{
+"dtype":{
+"default":"float16",
+"description":"Base dtype for the model and plugins",
+"title":"Dtype",
+"type":"string"
+},
+"bert_attention_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":"auto",
+"description":"The plugin that uses efficient kernels and enables an in-place update of the KV cache for attention layer of BERT-like encoder models.",
+"title":"Bert Attention Plugin"
+},
+"gpt_attention_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":"auto",
+"description":"The plugin that uses efficient kernels and enables an in-place update of the KV cache for attention layer of GPT-like decoder models.",
+"title":"Gpt Attention Plugin"
+},
+"gemm_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+"fp8",
+"nvfp4",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"The GEMM plugin that utilizes NVIDIA cuBLASLt to perform GEMM operations. Note: it's only affective for non-quantized gemm operations (except FP8).Note: For FP8, it also requires same calibration in checkpoint.",
+"title":"Gemm Plugin"
+},
+"gemm_swiglu_plugin":{
+"anyOf":[
+{
+"enum":[
+"fp8",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"The GEMM + SwiGLU fusion in Gated-MLP combines two Matmul operations and one SwiGLU operation into a single kernel. Currently this is only supported for FP8 precision on Hopper.",
+"title":"Gemm Swiglu Plugin"
+},
+"fp8_rowwise_gemm_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"The quantized GEMM for fp8, which uses per token dynamic scales for activation and per channel static scales for weights.Note: It also requires same calibration in checkpoint.",
+"title":"Fp8 Rowwise Gemm Plugin"
+},
+"qserve_gemm_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"The quantized GEMM from [QServe](https://arxiv.org/abs/2405.04532), which employs 4-bit quantization for weights and 8-bit quantization for activations.",
+"title":"Qserve Gemm Plugin"
+},
+"identity_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"The identity plugin simply copies inputs to outputs, it's used mostly for debugging purpose.",
+"title":"Identity Plugin"
+},
+"nccl_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":"auto",
+"description":"The NCCL plugin wraps NCCL operators to support multi-GPU and even multi-nodes.",
+"title":"Nccl Plugin"
+},
+"lora_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"Enable LoRA.",
+"title":"Lora Plugin"
+},
+"dora_plugin":{
+"default":false,
+"description":"Enable DoRA.",
+"title":"Dora Plugin",
+"type":"boolean"
+},
+"weight_only_groupwise_quant_matmul_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"Enable weight-only groupwise quantization matmul operators.",
+"title":"Weight Only Groupwise Quant Matmul Plugin"
+},
+"weight_only_quant_matmul_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"Enable weight-only quantization matmul operators.",
+"title":"Weight Only Quant Matmul Plugin"
+},
+"smooth_quant_plugins":{
+"default":true,
+"description":"Enable a group of plugins to support smooth quantization.",
+"title":"Smooth Quant Plugins",
+"type":"boolean"
+},
+"smooth_quant_gemm_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"Enable plugin that supports smooth quantization gemm kernels.",
+"title":"Smooth Quant Gemm Plugin"
+},
+"layernorm_quantization_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"Enable plugin that supports layernorm quantization kernels.",
+"title":"Layernorm Quantization Plugin"
+},
+"rmsnorm_quantization_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"Enable plugin that supports rmsnorm quantization kernels.",
+"title":"Rmsnorm Quantization Plugin"
+},
+"quantize_per_token_plugin":{
+"default":false,
+"description":"Enable plugin that supports per-token quantization.",
+"title":"Quantize Per Token Plugin",
+"type":"boolean"
+},
+"quantize_tensor_plugin":{
+"default":false,
+"description":"Enable plugin that supports per-tensor quantization.",
+"title":"Quantize Tensor Plugin",
+"type":"boolean"
+},
+"moe_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":"auto",
+"description":"Enable some customized kernels to speed up the MoE layer of MoE models.",
+"title":"Moe Plugin"
+},
+"mamba_conv1d_plugin":{
+"anyOf":[
+{
+"enum":[
+"auto",
+"float16",
+"float32",
+"bfloat16",
+"int32",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":"auto",
+"description":"Enable customized kernels to speed up conv1d operator for Mamba.",
+"title":"Mamba Conv1D Plugin"
+},
+"low_latency_gemm_plugin":{
+"anyOf":[
+{
+"enum":[
+"fp8",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"The GEMM plugin that optimized specially for low latency scenarios.",
+"title":"Low Latency Gemm Plugin"
+},
+"low_latency_gemm_swiglu_plugin":{
+"anyOf":[
+{
+"enum":[
+"fp8",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"The GEMM + SwiGLU fusion plugin that optimized specially for low latency scenarios.",
+"title":"Low Latency Gemm Swiglu Plugin"
+},
+"gemm_allreduce_plugin":{
+"anyOf":[
+{
+"enum":[
+"float16",
+"bfloat16",
+null
+]
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"The GEMM + AllReduce kernel fusion plugin.",
+"title":"Gemm Allreduce Plugin"
+},
+"context_fmha":{
+"default":true,
+"description":"Enable the fused multi-head attention during the context phase, will trigger a kernel that performs the MHA/MQA/GQA block using a single kernel.",
+"title":"Context Fmha",
+"type":"boolean"
+},
+"bert_context_fmha_fp32_acc":{
+"default":false,
+"description":"Enable the FP32 accumulator for context FMHA in the bert_attention_plugin. If disabled, FP16 is used, better performance but potentially worse accuracy is expected.",
+"title":"Bert Context Fmha Fp32 Acc",
+"type":"boolean"
+},
+"paged_kv_cache":{
+"anyOf":[
+{
+"type":"boolean"
+},
+{
+"type":"null"
+}
+],
+"default":null,
+"description":"Enable paged KV cache, which helps manage memory for the KV cache more efficiently, and usually leads to an increase in the batch size and an improved efficiency.",
+"title":"Paged Kv Cache"
+},
+"remove_input_padding":{
+"default":true,
+"description":"Pack different tokens together, which reduces both the amount of computations and memory consumption.",
+"title":"Remove Input Padding",
+"type":"boolean"
+},
+"norm_quant_fusion":{
+"default":false,
+"description":"Fuse the LayerNorm and quantization kernels into a single kernel, resulting in improved end-to-end performance.",
+"title":"Norm Quant Fusion",
+"type":"boolean"
+},
+"reduce_fusion":{
+"default":false,
+"description":"Fuse the ResidualAdd and LayerNorm kernels after AllReduce into a single kernel, resulting in improved end-to-end performance.",
+"title":"Reduce Fusion",
+"type":"boolean"
+},
+"user_buffer":{
+"default":false,
+"description":"Eliminate extra copies from the local buffer to the shared buffer in the communication kernel, leading to improved end-to-end performance. This feature must be enabled with `--reduce_fusion enable` and is currently only supported for the FP8 LLAMA model.",
+"title":"User Buffer",
+"type":"boolean"
+},
+"tokens_per_block":{
+"default":32,
+"description":"Define how many tokens are contained in each paged kv cache block.",
+"title":"Tokens Per Block",
+"type":"integer"
+},
+"use_paged_context_fmha":{
+"default":true,
+"description":"Allow advanced features like KV cache reuse and chunked context.",
+"title":"Use Paged Context Fmha",
+"type":"boolean"
+},
+"use_fp8_context_fmha":{
+"default":true,
+"description":"When FP8 quantization is activated, the attention can be further accelerated by enabling FP8 Context FMHA",
+"title":"Use Fp8 Context Fmha",
+"type":"boolean"
+},
+"fuse_fp4_quant":{
+"default":false,
+"description":"Whether to fuse FP4 quantization into attention kernel.",
+"title":"Fuse Fp4 Quant",
+"type":"boolean"
+},
+"multiple_profiles":{
+"default":false,
+"description":"Enables multiple TensorRT optimization profiles in the built engines, will benefits the performance especially when GEMM plugin is disabled, because more optimization profiles help TensorRT have more chances to select better kernels. Note: This feature increases engine build time but no other adverse effects are expected.",
+"title":"Multiple Profiles",
+"type":"boolean"
+},
+"paged_state":{
+"default":true,
+"description":"Enable paged state, which helps manage memory for the RNN state more efficiently.",
+"title":"Paged State",
+"type":"boolean"
+},
+"streamingllm":{
+"default":false,
+"description":"Enable [StreamingLLM](https://arxiv.org/abs/2309.17453), which uses a window attention to perform efficient and stable LLM on long texts.",
+"title":"Streamingllm",
+"type":"boolean"
+},
+"manage_weights":{
+"default":false,
+"description":"Enable TensorRT LLM managed weights to speed up engine building process.",
+"title":"Manage Weights",
+"type":"boolean"
+},
+"use_fused_mlp":{
+"default":true,
+"description":"Enable horizontal fusion in Gated-MLP that combines two Matmul operations into a single one followed by a separate SwiGLU kernel.",
+"title":"Use Fused Mlp",
+"type":"boolean"
+},
+"pp_reduce_scatter":{
+"default":false,
+"description":"Enable a pipeline parallelism optimization with ReduceScatter + AllGather targeting large MoE models.",
+"title":"Pp Reduce Scatter",
+"type":"boolean"
+}
+}
+}
+
Enable the FP32 accumulator for context FMHA in the bert_attention_plugin. If disabled, FP16 is used; better performance but potentially worse accuracy is expected.
The quantized GEMM for FP8, which uses per-token dynamic scales for activations and per-channel static scales for weights. Note: It also requires the same calibration in the checkpoint.
The GEMM plugin that utilizes NVIDIA cuBLASLt to perform GEMM operations. Note: it is only effective for non-quantized GEMM operations (except FP8). Note: For FP8, it also requires the same calibration in the checkpoint.
The GEMM + SwiGLU fusion in Gated-MLP combines two Matmul operations and one SwiGLU operation into a single kernel. Currently this is only supported for FP8 precision on Hopper.
Enables multiple TensorRT optimization profiles in the built engines, which benefits performance especially when the GEMM plugin is disabled, because more optimization profiles give TensorRT more chances to select better kernels. Note: This feature increases engine build time but no other adverse effects are expected.
Enable paged KV cache, which helps manage memory for the KV cache more efficiently and usually leads to an increase in the batch size and improved efficiency.
The quantized GEMM from [QServe](https://arxiv.org/abs/2405.04532), which employs 4-bit quantization for weights and 8-bit quantization for activations.
Eliminate extra copies from the local buffer to the shared buffer in the communication kernel, leading to improved end-to-end performance. This feature must be enabled with --reduce_fusion enable and is currently only supported for the FP8 LLAMA model.
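As a rough usage sketch for the engine-build flow (assuming plugin options can be assigned as attributes of BuildConfig.plugin_config; exact import paths and defaults may differ between releases):
from tensorrt_llm import BuildConfig

build_config = BuildConfig(max_batch_size=8, max_seq_len=2048)
# "auto" enables a plugin with the model dtype, None disables it,
# and boolean fields toggle non-plugin features.
build_config.plugin_config.gemm_plugin = "auto"
build_config.plugin_config.gpt_attention_plugin = "auto"
build_config.plugin_config.paged_kv_cache = True
build_config.plugin_config.multiple_profiles = True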
The TensorRT-LLM C++ runtime uses a stream-ordered memory allocator to allocate and free buffers; see BufferManager::initMemoryPool, which uses the default memory pool managed by the CUDA driver. When a TrtGptModel object is destroyed, memory is returned to the memory pool and can be reused by the next instance of a TrtGptModel object. Memory will be released from the pool if it is required for other memory allocations.
However, nvidia-smi may still show high memory occupation after memory is returned to the CUDA driver’s memory pool. This should not be a concern and is intended behavior. The amount of reserved and free memory in the pool can be inspected with BufferManager::memoryPoolReserved() and BufferManager::memoryPoolFree(), respectively.
@@ -562,7 +566,7 @@ maintaining the accuracy of the network (on downstream tasks).
weights of the model. TensorRT-LLM includes scripts to prepare the model to
run using the SmoothQuant method.
Examples of how to enable SmoothQuant for GPT, GPT-J and LLaMA can be found in
the examples/quantization folder of that release.
@@ -571,8 +575,8 @@ a model and dequantizing those weights on-the-fly in linear layers (Matmuls).
The activations are encoded using floating-point values (FP16 or BF16).
To use INT4/INT8 Weight-Only methods, the user must determine the scaling
factors to use to quantize and dequantize the weights of the model.
This release includes examples for GPT and LLaMA.
@@ -679,9 +683,9 @@ This feature is currently in beta, and the related API is subjected to change in
diff --git a/latest/llm-api/index.html b/latest/llm-api/index.html
index c78f5c20be..90b022fbc3 100644
--- a/latest/llm-api/index.html
+++ b/latest/llm-api/index.html
max_batch_size (Optional[int]) – stable The maximum batch size. Defaults to None.
max_input_len (Optional[int]) – stable The maximum input length. Defaults to None.
max_seq_len (Optional[int]) – stable The maximum sequence length. Defaults to None.
max_beam_width (Optional[int]) – stable The maximum beam width. Defaults to None.
max_num_tokens (Optional[int]) – stable The maximum number of tokens. Defaults to 8192.
gather_generation_logits (bool) – prototype Gather generation logits. Defaults to False.
num_postprocess_workers (int) – prototype The number of processes used for postprocessing the generated tokens, including detokenization. Defaults to 0.
postprocess_tokenizer_dir (Optional[str]) – prototype The path to the tokenizer directory for postprocessing. Defaults to None.
reasoning_parser (Optional[str]) – prototype The parser to separate reasoning content from output. Defaults to None.
return_perf_metrics (bool) – prototype Return perf metrics. Defaults to False.
orchestrator_type (Optional[Literal['rpc', 'ray']]) – prototype The orchestrator type to use. Defaults to None, which uses MPI.
garbage_collection_gen0_threshold (int) – beta Threshold for Python garbage collection of generation 0 objects.Lower values trigger more frequent garbage collection. Defaults to 20000.
cuda_graph_config (Optional[tensorrt_llm.llmapi.llm_args.CudaGraphConfig]) – beta CUDA graph config.If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_config.batch_sizes, and are enabled for batches that consist of decoding requests only (the reason is that it’s hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory. Defaults to None.
scheduling_params (tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], optional) – Scheduling parameters. Defaults to None.
cache_salt (str, Sequence[str], optional) – If specified, KV cache will be salted with the provided string to limit the kv cache reuse to the requests with the same string. Defaults to None.
Returns:
@@ -1113,6 +1121,10 @@ after prompts have been submitted.
@@ -1133,6 +1145,8 @@ after prompts have been submitted.
finish_reason (Literal['stop', 'length', 'timeout', 'cancelled'], optional) – The reason why the sequence is finished. Defaults to None.
stop_reason (int, str, optional) – The stop string or token id that caused the completion to stop, None if the completion finished for some other reason. Defaults to None.
generation_logits (torch.Tensor, optional) – The logits on the generated output token ids. Defaults to None.
additional_context_outputs (Dict[str, torch.Tensor], optional) – The additional context outputs. Defaults to None.
additional_generation_outputs (Dict[str, torch.Tensor], optional) – The additional generation outputs. Defaults to None.
disaggregated_params (tensorrt_llm.disaggregated_params.DisaggregatedParams, optional) – Parameters needed for disaggregated serving. Includes the type of request, the first generated tokens, the context request id, and any additional state needing to be transferred from context and generation instances. Defaults to None.
request_perf_metrics (tensorrt_llm.bindings.executor.RequestPerfMetrics, optional) – Performance metrics for the request. Defaults to None.
@@ -1201,6 +1215,10 @@ after prompts have been submitted.
@@ -1834,13 +1876,25 @@ The BatchedLogitsProcessor class is recommended for callback creation. The callb
n (int) – Number of sequences to generate. Defaults to 1.
best_of (int, optional) – Number of sequences to consider for best output. Defaults to None.
use_beam_search (bool) – Whether to use beam search. Defaults to False.
top_k (int, optional) – Controls number of logits to sample from. Can assume non-negative values, where 0 means ‘all logits’. Defaults to None.
The value None is treated as “not specified” in the following.
If neither temperature, top_p, nor top_k are specified, sampling is greedy.
If temperature > 0 and/or top_p < 1 are specified, sampling will proceed accordingly and top_k will default to top_k = 0.
Setting top_k = 1 results in greedy sampling.
top_p (float, optional) – Controls the top-P probability to sample from. Can have values between 0 and 1. Defaults to None.
The value None is treated as “not specified” in the following.
If neither temperature, top_p, nor top_k are specified, sampling is greedy.
If temperature > 0 and/or top_k > 1 are specified, sampling will proceed accordingly and top_p will default to top_p = 1.
Setting top_p = 0 should result in greedy sampling, but is currently disallowed in the backend.
top_p_min (float, optional) – Controls decay in the top-P algorithm. topPMin is lower-bound. None means using C++ runtime default 1.e-6. Defaults to None.
top_p_reset_ids (int, optional) – Controls decay in the top-P algorithm. Indicates where to reset the decay. None means using C++ runtime default 1. Defaults to None.
top_p_decay (float, optional) – Controls decay in the top-P algorithm. The decay value. None means using C++ runtime default 1.f. Defaults to None.
seed (int, optional) – Controls the random seed used by the random number generator in sampling. None means using C++ runtime default 0. Defaults to None.
temperature (float, optional) – Controls the modulation of logits when sampling new tokens. It can have values >= 0.f. Defaults to None.
The value None is treated as “not specified” in the following.
If neither temperature, top_p, nor top_k are specified, sampling is greedy.
If top_p < 1 and/or top_k > 1 are specified, sampling will proceed accordingly and temperature will default to temperature = 1.
Setting temperature = 0 results in greedy sampling (see the sampling sketch after this parameter list).
min_tokens (int, optional) – Lower bound on the number of tokens to generate. Values < 1 have no effect. None means using C++ runtime default 1. Defaults to None.
beam_search_diversity_rate (float, optional) – Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. None means using C++ runtime default 1.f. Defaults to None.
repetition_penalty (float, optional) – Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. None means using C++ runtime default 1.f. Defaults to None.
@@ -1858,7 +1912,7 @@ The BatchedLogitsProcessor class is recommended for callback creation. The callb
exclude_input_from_output (bool) – Controls if output tokens in Result should include the input tokens. Defaults to True.
return_encoder_output (bool) – Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Defaults to False.
return_perf_metrics (bool) – Controls if Result should contain the performance metrics for this request. Defaults to False.
additional_model_outputs (List[str], optional) – The additional outputs to gather from the model. Defaults to None.
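A small sketch of how the temperature/top_p/top_k knobs interact, using the LLM-API SamplingParams (the values are arbitrary):
from tensorrt_llm import SamplingParams

# Nothing specified: greedy decoding.
greedy = SamplingParams(max_tokens=32)
# temperature/top_p specified, top_k omitted: top_k falls back to 0 (all logits).
nucleus = SamplingParams(max_tokens=32, temperature=0.8, top_p=0.95)
# Only top_k specified: temperature falls back to 1 and top_p to 1.
top_k_only = SamplingParams(max_tokens=32, top_k=40)
# temperature = 0 (or top_k = 1) explicitly requests greedy sampling.
explicit_greedy = SamplingParams(max_tokens=32, temperature=0.0)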
Creates a new instance of the Model class with validated data.
Creates a new model setting __dict__ and __pydantic_fields_set__ from trusted or pre-validated data.
Default values are respected, but no other validation is performed.
!!! note
model_construct() generally respects the model_config.extra setting on the provided model.
That is, if model_config.extra == ‘allow’, then all extra passed values are added to the model instance’s __dict__
and __pydantic_extra__ fields. If model_config.extra == ‘ignore’ (the default), then all extra passed values are ignored.
Because no validation is performed with a call to model_construct(), having model_config.extra == ‘forbid’ does not result in
an error if extra values are passed, but they will be ignored.
Parameters:
_fields_set – A set of field names that were originally explicitly set during instantiation. If provided,
this is directly used for the [model_fields_set][pydantic.BaseModel.model_fields_set] attribute.
Otherwise, the field names from the values argument will be used.
values – Trusted or pre-validated data dictionary.
Returns:
A new instance of the Model class with validated data.
The underlying instance’s [__dict__][object.__dict__] attribute is copied. This
+might have unexpected side effects if you store anything in it, on top of the model
+fields (e.g. the value of [cached properties][functools.cached_property]).
+
+
+
+
Parameters:
+
+
update – Values to change/add in the new model. Note: the data is not validated
+before creating the new model. You should trust this data.
+
deep – Set to True to make a deep copy of the model.
Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.
+
+
Parameters:
+
+
mode – The mode in which to_python should run.
+If mode is ‘json’, the output will only contain JSON serializable types.
+If mode is ‘python’, the output may contain non-JSON-serializable Python objects.
+
include – A set of fields to include in the output.
+
exclude – A set of fields to exclude from the output.
+
context – Additional context to pass to the serializer.
+
by_alias – Whether to use the field’s alias in the dictionary key if defined.
+
exclude_unset – Whether to exclude fields that have not been explicitly set.
+
exclude_defaults – Whether to exclude fields that are set to their default value.
+
exclude_none – Whether to exclude fields that have a value of None.
+
round_trip – If True, dumped values should be valid as input for non-idempotent types such as Json[T].
+
warnings – How to handle serialization errors. False/”none” ignores them, True/”warn” logs errors,
+“error” raises a [PydanticSerializationError][pydantic_core.PydanticSerializationError].
+
fallback – A function to call when an unknown value is encountered. If not provided,
+a [PydanticSerializationError][pydantic_core.PydanticSerializationError] error is raised.
+
serialize_as_any – Whether to serialize fields with duck-typing serialization behavior.
Generates a JSON representation of the model using Pydantic’s to_json method.
+
+
Parameters:
+
+
indent – Indentation to use in the JSON output. If None is passed, the output will be compact.
+
include – Field(s) to include in the JSON output.
+
exclude – Field(s) to exclude from the JSON output.
+
context – Additional context to pass to the serializer.
+
by_alias – Whether to serialize using field aliases.
+
exclude_unset – Whether to exclude fields that have not been explicitly set.
+
exclude_defaults – Whether to exclude fields that are set to their default value.
+
exclude_none – Whether to exclude fields that have a value of None.
+
round_trip – If True, dumped values should be valid as input for non-idempotent types such as Json[T].
+
warnings – How to handle serialization errors. False/”none” ignores them, True/”warn” logs errors,
+“error” raises a [PydanticSerializationError][pydantic_core.PydanticSerializationError].
+
fallback – A function to call when an unknown value is encountered. If not provided,
+a [PydanticSerializationError][pydantic_core.PydanticSerializationError] error is raised.
+
serialize_as_any – Whether to serialize fields with duck-typing serialization behavior.
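
A short sketch of model_dump and model_dump_json on a hypothetical model, showing the effect of mode and exclude:

    from datetime import datetime
    from pydantic import BaseModel

    class Event(BaseModel):   # hypothetical model, for illustration only
        name: str
        when: datetime

    e = Event(name="build", when=datetime(2024, 1, 1))
    print(e.model_dump())              # 'when' stays a datetime object
    print(e.model_dump(mode="json"))   # 'when' becomes an ISO-8601 string
    print(e.model_dump_json(indent=2, exclude={"when"}))
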
Compute the class name for parametrizations of generic classes.
+
This method can be overridden to achieve a custom naming scheme for generic BaseModels.
+
+
Parameters:
+
params – Tuple of types of the class. Given a generic class
+Model with 2 type variables and a concrete model Model[str, int],
+the value (str, int) would be passed to params.
+
+
Returns:
+
String representing the new class where params are passed to cls as type variables.
+
+
Raises:
+
TypeError – Raised when trying to generate concrete names for non-generic models.
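
A sketch of overriding model_parametrized_name on a hypothetical generic model to customize the concrete class name:

    from typing import Generic, TypeVar
    from pydantic import BaseModel

    T = TypeVar("T")

    class Response(BaseModel, Generic[T]):   # hypothetical generic model
        data: T

        @classmethod
        def model_parametrized_name(cls, params):
            # params is the tuple of type arguments, e.g. (int,)
            return f"{cls.__name__}Of{params[0].__name__.capitalize()}"

    print(Response[int].__name__)   # ResponseOfInt
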
Try to rebuild the pydantic-core schema for the model.
+
This may be necessary when one of the annotations is a ForwardRef which could not be resolved during
+the initial attempt to build the schema, and automatic rebuilding fails.
+
+
Parameters:
+
+
force – Whether to force the rebuilding of the model schema, defaults to False.
+
raise_errors – Whether to raise errors, defaults to True.
+
_parent_namespace_depth – The depth level of the parent namespace, defaults to 2.
+
_types_namespace – The types namespace, defaults to None.
+
+
+
Returns:
+
Returns None if the schema is already “complete” and rebuilding was not required.
+If rebuilding _was_ required, returns True if rebuilding was successful, otherwise False.
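
A minimal sketch of when model_rebuild matters, using a hypothetical self-referencing model:

    from typing import Optional
    from pydantic import BaseModel

    class Node(BaseModel):   # hypothetical self-referencing model
        value: int
        next: Optional["Node"] = None

    # If the "Node" forward reference could not be resolved when the class was
    # first built, the schema stays incomplete until an explicit rebuild.
    Node.model_rebuild()   # None if already complete; otherwise True/False
    print(Node(value=1, next=Node(value=2)))
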
Returns the number of target-model layers to capture.
+If eagle3_layers_to_capture is not None, return the length of that set.
+Otherwise, assume the Eagle3 base set and return 3 + 1 (the extra layer accounts for the post-norm last hidden state).
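
A standalone sketch of the rule above; the helper function below is hypothetical, and in the codebase this logic lives on the speculative-decoding configuration class:

    def num_capture_layers(eagle3_layers_to_capture):
        # Hypothetical free function mirroring the docstring above.
        if eagle3_layers_to_capture is not None:
            return len(eagle3_layers_to_capture)   # assumed to be a set of layer indices
        return 3 + 1   # Eagle3 base set plus the post-norm last hidden state

    print(num_capture_layers(None))         # 4
    print(num_capture_layers({2, 16, 30}))  # 3
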
Creates a new instance of the Model class with validated data.
+
Creates a new model setting __dict__ and __pydantic_fields_set__ from trusted or pre-validated data.
+Default values are respected, but no other validation is performed.
+
+
!!! note
model_construct() generally respects the model_config.extra setting on the provided model.
+That is, if model_config.extra == ‘allow’, then all extra passed values are added to the model instance’s __dict__
+and __pydantic_extra__ fields. If model_config.extra == ‘ignore’ (the default), then all extra passed values are ignored.
+Because no validation is performed with a call to model_construct(), having model_config.extra == ‘forbid’ does not result in
+an error if extra values are passed, but they will be ignored.
+
+
+
+
Parameters:
+
+
_fields_set – A set of field names that were originally explicitly set during instantiation. If provided,
+this is directly used for the [model_fields_set][pydantic.BaseModel.model_fields_set] attribute.
+Otherwise, the field names from the values argument will be used.
+
values – Trusted or pre-validated data dictionary.
+
+
+
Returns:
+
A new instance of the Model class with validated data.
The underlying instance’s [__dict__][object.__dict__] attribute is copied. This
+might have unexpected side effects if you store anything in it, on top of the model
+fields (e.g. the value of [cached properties][functools.cached_property]).
+
+
+
+
Parameters:
+
+
update – Values to change/add in the new model. Note: the data is not validated
+before creating the new model. You should trust this data.
+
deep – Set to True to make a deep copy of the model.
Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.
+
+
Parameters:
+
+
mode – The mode in which to_python should run.
+If mode is ‘json’, the output will only contain JSON serializable types.
+If mode is ‘python’, the output may contain non-JSON-serializable Python objects.
+
include – A set of fields to include in the output.
+
exclude – A set of fields to exclude from the output.
+
context – Additional context to pass to the serializer.
+
by_alias – Whether to use the field’s alias in the dictionary key if defined.
+
exclude_unset – Whether to exclude fields that have not been explicitly set.
+
exclude_defaults – Whether to exclude fields that are set to their default value.
+
exclude_none – Whether to exclude fields that have a value of None.
+
round_trip – If True, dumped values should be valid as input for non-idempotent types such as Json[T].
+
warnings – How to handle serialization errors. False/”none” ignores them, True/”warn” logs errors,
+“error” raises a [PydanticSerializationError][pydantic_core.PydanticSerializationError].
+
fallback – A function to call when an unknown value is encountered. If not provided,
+a [PydanticSerializationError][pydantic_core.PydanticSerializationError] error is raised.
+
serialize_as_any – Whether to serialize fields with duck-typing serialization behavior.
Generates a JSON representation of the model using Pydantic’s to_json method.
+
+
Parameters:
+
+
indent – Indentation to use in the JSON output. If None is passed, the output will be compact.
+
include – Field(s) to include in the JSON output.
+
exclude – Field(s) to exclude from the JSON output.
+
context – Additional context to pass to the serializer.
+
by_alias – Whether to serialize using field aliases.
+
exclude_unset – Whether to exclude fields that have not been explicitly set.
+
exclude_defaults – Whether to exclude fields that are set to their default value.
+
exclude_none – Whether to exclude fields that have a value of None.
+
round_trip – If True, dumped values should be valid as input for non-idempotent types such as Json[T].
+
warnings – How to handle serialization errors. False/”none” ignores them, True/”warn” logs errors,
+“error” raises a [PydanticSerializationError][pydantic_core.PydanticSerializationError].
+
fallback – A function to call when an unknown value is encountered. If not provided,
+a [PydanticSerializationError][pydantic_core.PydanticSerializationError] error is raised.
+
serialize_as_any – Whether to serialize fields with duck-typing serialization behavior.
Compute the class name for parametrizations of generic classes.
+
This method can be overridden to achieve a custom naming scheme for generic BaseModels.
+
+
Parameters:
+
params – Tuple of types of the class. Given a generic class
+Model with 2 type variables and a concrete model Model[str, int],
+the value (str, int) would be passed to params.
+
+
Returns:
+
String representing the new class where params are passed to cls as type variables.
+
+
Raises:
+
TypeError – Raised when trying to generate concrete names for non-generic models.
Override this method to perform additional initialization after __init__ and model_construct.
+This is useful if you want to do some validation that requires the entire model to be initialized.
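
A short sketch of overriding model_post_init on a hypothetical model to run whole-model validation once all fields are set:

    from pydantic import BaseModel

    class Range(BaseModel):   # hypothetical model, for illustration only
        low: int
        high: int

        def model_post_init(self, context) -> None:
            # Runs after __init__ and after model_construct.
            if self.low > self.high:
                raise ValueError("low must not exceed high")

    Range(low=1, high=5)     # ok
    # Range(low=7, high=2)   # would raise an error from model_post_init
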
Try to rebuild the pydantic-core schema for the model.
+
This may be necessary when one of the annotations is a ForwardRef which could not be resolved during
+the initial attempt to build the schema, and automatic rebuilding fails.
+
+
Parameters:
+
+
force – Whether to force the rebuilding of the model schema, defaults to False.
+
raise_errors – Whether to raise errors, defaults to True.
+
_parent_namespace_depth – The depth level of the parent namespace, defaults to 2.
+
_types_namespace – The types namespace, defaults to None.
+
+
+
Returns:
+
Returns None if the schema is already “complete” and rebuilding was not required.
+If rebuilding _was_ required, returns True if rebuilding was successful, otherwise False.