Update latest GitHub pages to v1.0.0rc0

Kaiyu Xie 2025-06-25 02:49:40 +00:00
parent c0d5a842c7
commit 66ef46ab8c
197 changed files with 4498 additions and 1913 deletions

View File

@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 8e10976759c98fbc1fa1e519991f5ea4
config: 5dd2b8f29ac03c9c53f8ad8ba1fb6dcc
tags: 645f666f9bcd5a90fca523b33c5a78b7

View File

@ -51,7 +51,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -63,7 +63,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -6962,6 +6962,12 @@
<dd><p>Enable guided decoding with XGrammar backend. </p>
</dd></dl>
<dl class="cpp enumerator">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend11kLLGUIDANCEE">
<span id="_CPPv3N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend11kLLGUIDANCEE"></span><span id="_CPPv2N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend11kLLGUIDANCEE"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1GuidedDecodingConfig_1a8a09e91495919291c648a0ef8c53d912ac2a7f8385259c19055da1af0f2b11bec"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kLLGUIDANCE</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend11kLLGUIDANCEE" title="Link to this definition">#</a><br /></dt>
<dd><p>Enable guided decoding with LLGuidance backend. </p>
</dd></dl>
</dd></dl>
</div>
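
The new kLLGUIDANCE enumerator sits next to kXGRAMMAR on GuidedDecodingConfig::GuidedDecodingBackend. As a rough illustration only, a sketch of selecting it from the Python executor bindings, assuming the C++ class and its nested enum are mirrored there under the same names and constructor order:

# Hedged sketch, not from this commit: assumes tensorrt_llm.bindings.executor
# exposes GuidedDecodingConfig and its GuidedDecodingBackend enum as in the C++ API.
from tensorrt_llm.bindings import executor as trtllm

backend = trtllm.GuidedDecodingConfig.GuidedDecodingBackend.LLGUIDANCE
# Remaining constructor arguments follow the C++ signature documented above:
# optional encoded vocab, optional tokenizer string, optional stop token ids.
guided_cfg = trtllm.GuidedDecodingConfig(backend, None, None, None)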
@ -12286,6 +12292,7 @@
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfigE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::GuidedDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackendE"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingBackend</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend9kXGRAMMARE"><code class="docutils literal notranslate"><span class="pre">kXGRAMMAR</span></code></a></li>
<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend11kLLGUIDANCEE"><code class="docutils literal notranslate"><span class="pre">kLLGUIDANCE</span></code></a></li>
</ul>
</li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingConfig()</span></code></a></li>
@ -13043,9 +13050,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -51,7 +51,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -63,7 +63,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -13706,9 +13706,9 @@ one more than decoding draft tokens for prediction from primary head </p>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -334,21 +334,16 @@ def extract_extra_attrs(layer_idx: str):
return metadata, mla_layer
@torch.library.custom_op("trtllm::mla_custom_op", mutates_args=())
def mla_custom_op(
position_ids: Optional[torch.Tensor],
@torch.library.custom_op("trtllm::mla_custom_op_inplace",
mutates_args=("output", ))
def mla_custom_op_inplace(
hidden_states: torch.Tensor,
position_ids: Optional[torch.Tensor],
layer_idx: str,
) -> torch.Tensor:
output: torch.Tensor,
) -> None:
metadata, mla_layer = extract_extra_attrs(layer_idx)
return mla_layer.forward_impl(position_ids, hidden_states, metadata)
@mla_custom_op.register_fake
def _(position_ids, hidden_states, layer_idx):
_, mla_layer = extract_extra_attrs(layer_idx)
return mla_layer.forward_impl_fake(hidden_states)
mla_layer.forward_impl(position_ids, hidden_states, metadata, output=output)
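
The hunk above swaps the value-returning custom op for an in-place variant that writes into a caller-provided buffer via mutates_args. A standalone sketch of that torch.library pattern, with purely illustrative names:

# Illustrative only; assumes a recent PyTorch with torch.library.custom_op.
import torch

@torch.library.custom_op("demo::scale_inplace", mutates_args=("out", ))
def scale_inplace(x: torch.Tensor, factor: float, out: torch.Tensor) -> None:
    # Write the result into the pre-allocated buffer instead of returning a new tensor.
    out.copy_(x * factor)

out = torch.empty(4)
scale_inplace(torch.arange(4.0), 2.0, out=out)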
class MLA(nn.Module):
@ -671,18 +666,17 @@ class MLA(nn.Module):
self.qk_rope_head_dim)
return k_pe
def forward_impl_fake(self, hidden_states: torch.Tensor):
def create_output(self, hidden_states: torch.Tensor):
num_tokens = hidden_states.shape[0]
hidden_size = self.o_proj.in_features
return hidden_states.new_empty([num_tokens, hidden_size],
dtype=hidden_states.dtype)
def forward_impl(
self,
position_ids: Optional[torch.Tensor],
hidden_states: torch.Tensor,
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
def forward_impl(self,
position_ids: Optional[torch.Tensor],
hidden_states: torch.Tensor,
attn_metadata: AttentionMetadata,
output: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Forward pass for the MLA module.
@ -739,9 +733,15 @@ class MLA(nn.Module):
assert position_ids is not None
k_pe_ctx = self.apply_rope(q_ctx, k_pe_ctx, position_ids)
attn_output_context = self.forward_context(q_ctx, compressed_kv_ctx,
k_pe_ctx, attn_metadata,
latent_cache_ctx)
attn_output_context = self.forward_context(
q_ctx,
compressed_kv_ctx,
k_pe_ctx,
attn_metadata,
latent_cache_ctx,
output=output if num_generations == 0 else None)
if num_generations == 0:
return attn_output_context
else:
attn_output_context = None
@ -754,9 +754,15 @@ class MLA(nn.Module):
assert position_ids is not None
k_pe_gen = self.apply_rope(q_gen, k_pe_gen, position_ids)
attn_output_gen = self.forward_generation(q_gen, compressed_kv_gen,
k_pe_gen, attn_metadata,
latent_cache_gen)
attn_output_gen = self.forward_generation(
q_gen,
compressed_kv_gen,
k_pe_gen,
attn_metadata,
latent_cache_gen,
output=output if num_contexts == 0 else None)
if num_contexts == 0:
return attn_output_gen
else:
attn_output_gen = None
@ -765,25 +771,22 @@ class MLA(nn.Module):
compressed_kv = None
k_pe = None
# merge context and gen batches
if attn_output_context is not None and attn_output_gen is not None:
assert (
len(attn_output_context.shape) == 2
), f"attn_output_context must be rank 2, not {len(attn_output_context.shape)}"
assert (
len(attn_output_gen.shape) == 2
), f"attn_output_gen must be rank 2, not {len(attn_output_gen.shape)}"
attn_output = torch.cat([attn_output_context, attn_output_gen],
dim=0)
# release pytorch activation memory
attn_output_context = None
attn_output_gen = None
elif attn_output_gen is None:
attn_output = attn_output_context
else:
attn_output = attn_output_gen
return attn_output
assert attn_output_context is not None and attn_output_gen is not None
assert (
len(attn_output_context.shape) == 2
), f"attn_output_context must be rank 2, not {len(attn_output_context.shape)}"
assert (
len(attn_output_gen.shape) == 2
), f"attn_output_gen must be rank 2, not {len(attn_output_gen.shape)}"
output = output if output is not None else torch.empty(
(num_tokens, attn_output_context.shape[1]),
dtype=attn_output_context.dtype,
device=attn_output_context.device)
output[:attn_output_context.shape[0], :] = attn_output_context
output[attn_output_context.shape[0]:, :] = attn_output_gen
attn_output_context = None
attn_output_gen = None
return output
def _maybe_concat_qkv(self, q, k, v):
if k is not None and v is not None and self.support_fused_qkv:
@ -792,13 +795,13 @@ class MLA(nn.Module):
return q, k, v
def forward_context_default(
self,
q: torch.Tensor,
compressed_kv: torch.Tensor,
k_pe: torch.Tensor,
attn_metadata: AttentionMetadata,
latent_cache: Optional[torch.Tensor] = None,
) -> torch.Tensor:
self,
q: torch.Tensor,
compressed_kv: torch.Tensor,
k_pe: torch.Tensor,
attn_metadata: AttentionMetadata,
latent_cache: Optional[torch.Tensor] = None,
output: Optional[torch.Tensor] = None) -> torch.Tensor:
kv = self.kv_b_proj(compressed_kv)
k_nope, v = kv.split(
[
@ -830,6 +833,7 @@ class MLA(nn.Module):
attention_input_type=AttentionInputType.context_only,
latent_cache=latent_cache,
out_scale=out_scale,
output=output,
)
return attn_output
@ -839,6 +843,7 @@ class MLA(nn.Module):
q: torch.Tensor,
latent_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
assert latent_cache is not None
trtllm_attention = cast(TrtllmAttention, self.mha)
@ -912,6 +917,7 @@ class MLA(nn.Module):
mla_context_paged_kv=paged_full_kv,
mla_context_kv_cache_block_offsets=
mla_context_kv_cache_block_offsets,
output=output,
)
return attn_output
@ -923,24 +929,25 @@ class MLA(nn.Module):
k_pe: torch.Tensor,
attn_metadata: AttentionMetadata,
latent_cache: Optional[torch.Tensor] = None,
output: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if isinstance(self.mha, TrtllmAttention):
assert isinstance(attn_metadata, TrtllmAttentionMetadata)
trtllm_attention = cast(TrtllmAttention, self.mha)
if trtllm_attention.has_cached_kv_for_mla_context(attn_metadata):
return self.forward_context_with_cached_kv(
q, latent_cache, attn_metadata)
q, latent_cache, attn_metadata, output)
return self.forward_context_default(q, compressed_kv, k_pe,
attn_metadata, latent_cache)
attn_metadata, latent_cache, output)
def forward_generation(
self,
q: torch.Tensor,
compressed_kv: torch.Tensor,
k_pe: torch.Tensor,
attn_metadata: AttentionMetadata,
latent_cache: Optional[torch.Tensor] = None,
) -> torch.Tensor:
self,
q: torch.Tensor,
compressed_kv: torch.Tensor,
k_pe: torch.Tensor,
attn_metadata: AttentionMetadata,
latent_cache: Optional[torch.Tensor] = None,
output: Optional[torch.Tensor] = None) -> torch.Tensor:
num_tokens = q.shape[0]
q_nope, q_pe = q.view([-1, self.num_heads, self.qk_head_dim]).split(
[self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
@ -1011,9 +1018,13 @@ class MLA(nn.Module):
attn_out_latent = attn_out_latent.view(
[-1, self.num_heads, self.kv_lora_rank])
attn_output = torch.empty([num_tokens, self.num_heads, self.v_head_dim],
dtype=attn_out_latent.dtype,
device=attn_out_latent.device)
# [seq, num_heads * v_head_dim]
output = output if output is not None else torch.empty(
[num_tokens, self.num_heads * self.v_head_dim],
dtype=attn_out_latent.dtype,
device=attn_out_latent.device)
attn_output = output.view([num_tokens, self.num_heads, self.v_head_dim])
if self.v_b_proj.dtype == torch.bfloat16:
# [num_heads, seq, kv_lora_rank] x [num_heads, kv_lora_rank, v_head_dim]
@ -1033,8 +1044,7 @@ class MLA(nn.Module):
raise NotImplementedError(
f"Missing bmm impl for dtype: {self.v_b_proj.dtype}.")
# [seq, num_heads * v_head_dim]
return attn_output.flatten(1, 2)
return output
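
forward_generation now fills a flat [num_tokens, num_heads * v_head_dim] buffer and views it as [num_tokens, num_heads, v_head_dim], so the per-head result lands directly in the caller's output. A small standalone illustration of why the view aliases the same storage:

# Standalone illustration (shapes are arbitrary examples).
import torch

num_tokens, num_heads, v_head_dim = 3, 2, 4
out = torch.empty(num_tokens, num_heads * v_head_dim)
out_3d = out.view(num_tokens, num_heads, v_head_dim)  # shares storage with `out`
out_3d.copy_(torch.randn(num_tokens, num_heads, v_head_dim))
assert out.data_ptr() == out_3d.data_ptr()  # writes through the view land in `out`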
def forward(
self,
@ -1043,12 +1053,17 @@ class MLA(nn.Module):
attn_metadata: AttentionMetadata,
all_reduce_params: Optional[AllReduceParams] = None,
) -> torch.Tensor:
attn_output = self.create_output(hidden_states)
if self.register_to_config:
attn_output = torch.ops.trtllm.mla_custom_op(
position_ids, hidden_states, self.layer_idx_str)
torch.ops.trtllm.mla_custom_op_inplace(hidden_states, position_ids,
self.layer_idx_str,
attn_output)
else:
attn_output = self.forward_impl(position_ids, hidden_states,
attn_metadata)
self.forward_impl(position_ids,
hidden_states,
attn_metadata,
output=attn_output)
attn_output = self.o_proj(attn_output,
all_reduce_params=all_reduce_params)
return attn_output

View File

@ -207,6 +207,7 @@ class DecodingBaseConfig(BaseModel):
"Eagle": EagleDecodingConfig,
"Lookahead": LookaheadDecodingConfig,
"NGram": NGramDecodingConfig,
"DraftTarget": DraftTargetDecodingConfig,
}
config_class = config_classes.get(decoding_type)
@ -238,7 +239,7 @@ class EagleDecodingConfig(DecodingBaseConfig):
dynamic_tree_max_topK: Optional[int] = None
num_eagle_layers: Optional[int] = None
max_non_leaves_per_layer: Optional[int] = None
pytorch_eagle_weights_path: Optional[str] = None
pytorch_weights_path: Optional[str] = None
eagle3_one_model: Optional[bool] = True
@classmethod
@ -282,11 +283,22 @@ class NGramDecodingConfig(DecodingBaseConfig):
decoding_type: ClassVar[str] = "NGram"
class DraftTargetDecodingConfig(DecodingBaseConfig):
pytorch_weights_path: Optional[str] = None
@classmethod
def from_dict(cls, data: dict):
return cls(**data)
decoding_type: ClassVar[str] = "DraftTarget"
class MTPDecodingConfig(DecodingBaseConfig):
num_nextn_predict_layers: Optional[int] = 1
use_relaxed_acceptance_for_thinking: Optional[bool] = False
relaxed_topk: Optional[int] = 1
relaxed_delta: Optional[float] = 0.
use_mtp_vanilla: Optional[bool] = False
@classmethod
def from_dict(cls, data: dict):
@ -896,10 +908,11 @@ class BaseLlmArgs(BaseModel):
default=None, description="Cache transceiver config.")
# Speculative decoding parameters
speculative_config: Optional[Union[
LookaheadDecodingConfig, MedusaDecodingConfig, EagleDecodingConfig,
MTPDecodingConfig, NGramDecodingConfig]] = Field(
default=None, description="Speculative decoding config.")
speculative_config: Optional[
Union[LookaheadDecodingConfig, MedusaDecodingConfig,
EagleDecodingConfig, MTPDecodingConfig, NGramDecodingConfig,
DraftTargetDecodingConfig]] = Field(
default=None, description="Speculative decoding config.")
batching_type: Optional[BatchingType] = Field(default=None,
description="Batching type.")
@ -941,6 +954,12 @@ class BaseLlmArgs(BaseModel):
default=None,
description="The parser to separate reasoning content from output.")
garbage_collection_gen0_threshold: int = Field(
default=20000,
description=
"Threshold for Python garbage collection of generation 0 objects."
"Lower values trigger more frequent garbage collection.")
# TODO[Superjomn]: To deprecate this config.
decoding_config: Optional[object] = Field(
default=None,
@ -1296,7 +1315,7 @@ class BaseLlmArgs(BaseModel):
self.speculative_config = Eagle3Config(
max_draft_tokens=self.speculative_config.max_draft_len,
draft_model_path=self.speculative_config.
pytorch_eagle_weights_path,
pytorch_weights_path,
eagle3_one_model=self.speculative_config.
eagle3_one_model)
elif isinstance(self.speculative_config, NGramDecodingConfig):
@ -1314,6 +1333,16 @@ class BaseLlmArgs(BaseModel):
is_use_oldest=self.speculative_config.is_use_oldest,
is_public_pool=self.speculative_config.is_public_pool,
)
elif isinstance(self.speculative_config, DraftTargetDecodingConfig):
self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
assert self.backend == 'pytorch'
assert self.speculative_config.max_draft_len > 0
self.build_config.max_draft_len = self.speculative_config.max_draft_len
from tensorrt_llm._torch.speculative import DraftTargetConfig
self.speculative_config = DraftTargetConfig(
max_draft_tokens=self.speculative_config.max_draft_len,
draft_model_path=self.speculative_config.
pytorch_weights_path)
elif isinstance(self.speculative_config, MTPDecodingConfig):
from tensorrt_llm._torch.speculative import MTPConfig
self.speculative_config = MTPConfig(
@ -1323,7 +1352,8 @@ class BaseLlmArgs(BaseModel):
use_relaxed_acceptance_for_thinking=self.speculative_config.
use_relaxed_acceptance_for_thinking,
relaxed_topk=self.speculative_config.relaxed_topk,
relaxed_delta=self.speculative_config.relaxed_delta)
relaxed_delta=self.speculative_config.relaxed_delta,
use_mtp_vanilla=self.speculative_config.use_mtp_vanilla)
else:
raise ValueError(
f"Speculative config type not recognized: {self.speculative_config}"
@ -1563,12 +1593,6 @@ class TrtLlmArgs(BaseLlmArgs):
return self
LlmArgs = TrtLlmArgs
LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(LlmArgs,
indent=' ' * 4)
class LoadFormat(Enum):
AUTO = 0
# Initialize all weights randomly.
@ -1579,18 +1603,18 @@ class TorchCompileConfig(BaseModel):
"""
Configuration for torch.compile.
"""
torch_compile_fullgraph: bool = Field(
enable_fullgraph: bool = Field(
default=True,
description="Enable full graph compilation in torch.compile.")
torch_compile_inductor_enabled: bool = Field(
enable_inductor: bool = Field(
default=False, description="Enable inductor backend in torch.compile.")
torch_compile_piecewise_cuda_graph: bool = Field(
enable_piecewise_cuda_graph: bool = Field(
default=False,
description="Enable piecewise CUDA graph in torch.compile.")
torch_compile_enable_userbuffers: bool = Field(
enable_userbuffers: bool = Field(
default=True,
description=
"When torch compile is enabled, userbuffers is enabled by default.")
@ -1638,7 +1662,10 @@ class TorchLlmArgs(BaseLlmArgs):
moe_load_balancer: Optional[Union[object, str]] = Field(
default=None,
description="Configuration for MoE load balancing.",
json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})
json_schema_extra={
"type":
"Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]"
})
attn_backend: str = Field(default='TRTLLM',
description="Attention backend to use.")
@ -1695,6 +1722,14 @@ class TorchLlmArgs(BaseLlmArgs):
"If true, enable min-latency mode. Currently only used for Llama4.",
)
# TODO: make this a per-request parameter
stream_interval: int = Field(
default=1,
description=
"The iteration interval to create responses under the streaming mode. "
"Set this to a larger value when the batch size is large, which helps reduce the streaming overhead.",
)
# TODO: remove backend later
@field_validator('backend', mode='before')
def init_backend(cls, v):
@ -1747,6 +1782,13 @@ class TorchLlmArgs(BaseLlmArgs):
) from e
return self
@model_validator(mode="after")
def validate_stream_interval(self):
if self.stream_interval <= 0:
raise ValueError(
f"stream_interval must be positive, got {self.stream_interval}")
return self
# TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
def get_pytorch_backend_config(self) -> "PyTorchConfig":
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
@ -1769,22 +1811,21 @@ class TorchLlmArgs(BaseLlmArgs):
enable_iter_req_stats=self.enable_iter_req_stats,
print_iter_log=self.print_iter_log,
torch_compile_enabled=bool(self.torch_compile_config is not None),
torch_compile_fullgraph=self.torch_compile_config.
torch_compile_fullgraph
torch_compile_fullgraph=self.torch_compile_config.enable_fullgraph
if self.torch_compile_config is not None else True,
torch_compile_inductor_enabled=self.torch_compile_config.
torch_compile_inductor_enabled
if self.torch_compile_config is not None else False,
enable_inductor if self.torch_compile_config is not None else False,
torch_compile_piecewise_cuda_graph=self.torch_compile_config.
torch_compile_piecewise_cuda_graph
enable_piecewise_cuda_graph
if self.torch_compile_config is not None else False,
torch_compile_enable_userbuffers=self.torch_compile_config.
torch_compile_enable_userbuffers
enable_userbuffers
if self.torch_compile_config is not None else True,
autotuner_enabled=self.autotuner_enabled,
enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker,
load_format=self.load_format,
enable_min_latency=self.enable_min_latency)
enable_min_latency=self.enable_min_latency,
stream_interval=self.stream_interval)
@field_validator('cuda_graph_max_batch_size')
@classmethod
@ -2040,3 +2081,12 @@ def get_model_format(model_dir: str) -> _ModelFormatKind:
)
else:
return model_format
LlmArgs = TorchLlmArgs
TRT_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TrtLlmArgs,
indent=' ' * 4)
TORCH_LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(TorchLlmArgs,
indent=' ' *
4)

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -688,9 +688,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1997,9 +1997,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -668,9 +668,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -807,6 +807,9 @@
<span class="n">handler</span><span class="p">(</span><span class="n">response</span><span class="o">.</span><span class="n">error_msg</span><span class="p">)</span>
<span class="n">response_result</span> <span class="o">=</span> <span class="n">response</span><span class="o">.</span><span class="n">result</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">response_result</span><span class="p">,</span> <span class="s2">&quot;_result&quot;</span><span class="p">):</span>
<span class="n">response_result</span><span class="o">.</span><span class="n">deserialize</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_done</span> <span class="o">=</span> <span class="n">response_result</span><span class="o">.</span><span class="n">is_final</span>
<span class="n">context_phase_params</span> <span class="o">=</span> <span class="n">response_result</span><span class="o">.</span><span class="n">context_phase_params</span>
<span class="bp">self</span><span class="o">.</span><span class="n">decoding_iter</span> <span class="o">=</span> <span class="n">response_result</span><span class="o">.</span><span class="n">decoding_iter</span>
@ -1131,6 +1134,11 @@
<span class="c1"># reshape from [1, T, V] to [T, V]</span>
<span class="n">logits</span> <span class="o">=</span> <span class="n">logits</span><span class="o">.</span><span class="n">squeeze</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="k">if</span> <span class="n">tokens</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">logits</span><span class="o">.</span><span class="n">size</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="o">&gt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">tokens</span><span class="p">):</span>
<span class="c1"># WAR for nvbug 5324291 where TRT backend might return more logits</span>
<span class="c1"># than output tokens.</span>
<span class="n">logits</span> <span class="o">=</span> <span class="n">logits</span><span class="p">[:</span><span class="nb">len</span><span class="p">(</span><span class="n">tokens</span><span class="p">)]</span>
<span class="n">logprobs</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">log_softmax</span><span class="p">(</span><span class="n">logits</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">float32</span><span class="p">),</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span>
<span class="n">topk_vals</span><span class="p">,</span> <span class="n">topk_indices</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">topk</span><span class="p">(</span><span class="n">logprobs</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">top_k</span><span class="p">,</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span>
@ -1275,9 +1283,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -511,6 +511,9 @@
<span></span><span class="kn">import</span><span class="w"> </span><span class="nn">asyncio</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">concurrent.futures</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">threading</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">traceback</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">concurrent.futures</span><span class="w"> </span><span class="kn">import</span> <span class="n">ProcessPoolExecutor</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">queue</span><span class="w"> </span><span class="kn">import</span> <span class="n">Empty</span><span class="p">,</span> <span class="n">Queue</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">NamedTuple</span><span class="p">,</span> <span class="n">Optional</span>
@ -518,12 +521,12 @@
<span class="kn">from</span><span class="w"> </span><span class="nn">strenum</span><span class="w"> </span><span class="kn">import</span> <span class="n">StrEnum</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">mpi_rank</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Response</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi.utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">print_colored_debug</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi.utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">enable_llm_debug</span><span class="p">,</span> <span class="n">print_colored_debug</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..llmapi.mpi_session</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">MpiCommSession</span><span class="p">,</span> <span class="n">MpiPoolSession</span><span class="p">,</span> <span class="n">MpiSession</span><span class="p">,</span>
<span class="n">RemoteMpiCommSessionClient</span><span class="p">)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..llmapi.utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">print_colored_debug</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
<span class="k">class</span><span class="w"> </span><span class="nc">LlmLauncherEnvs</span><span class="p">(</span><span class="n">StrEnum</span><span class="p">):</span>
@ -562,9 +565,9 @@
<span class="n">print_colored_debug</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Using RemoteMpiPoolSessionClient to bind to external MPI processes at </span><span class="si">{</span><span class="n">get_spawn_proxy_process_ipc_addr_env</span><span class="p">()</span><span class="si">}</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">,</span>
<span class="s2">&quot;yellow&quot;</span><span class="p">)</span>
<span class="n">hmac_key</span> <span class="o">=</span> <span class="n">get_spawn_proxy_process_ipc_hmac_key_env</span><span class="p">()</span>
<span class="n">get_spawn_proxy_process_ipc_hmac_key_env</span><span class="p">()</span>
<span class="k">return</span> <span class="n">RemoteMpiCommSessionClient</span><span class="p">(</span>
<span class="n">addr</span><span class="o">=</span><span class="n">get_spawn_proxy_process_ipc_addr_env</span><span class="p">(),</span> <span class="n">hmac_key</span><span class="o">=</span><span class="n">hmac_key</span><span class="p">)</span>
<span class="n">addr</span><span class="o">=</span><span class="n">get_spawn_proxy_process_ipc_addr_env</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">print_colored_debug</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Using MpiCommSession to bind to external MPI processes</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">,</span>
@ -657,8 +660,26 @@
<span class="k">def</span><span class="w"> </span><span class="nf">is_llm_response</span><span class="p">(</span><span class="n">instance</span><span class="p">):</span>
<span class="k">return</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">instance</span><span class="p">,</span> <span class="n">Response</span><span class="p">)</span> <span class="ow">or</span> \
<span class="p">(</span><span class="nb">hasattr</span><span class="p">(</span><span class="n">instance</span><span class="p">,</span> <span class="s1">&#39;_is_llm_response&#39;</span><span class="p">)</span> <span class="ow">and</span> <span class="n">instance</span><span class="o">.</span><span class="n">_is_llm_response</span><span class="p">)</span>
<span class="k">return</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">instance</span><span class="p">,</span> <span class="s2">&quot;result&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">print_alive_threads</span><span class="p">():</span>
<span class="k">assert</span> <span class="n">enable_llm_debug</span><span class="p">(</span>
<span class="p">),</span> <span class="s2">&quot;print_alive_threads must be called with enable_llm_debug() enabled&quot;</span>
<span class="c1"># Print all alive threads for debugging</span>
<span class="n">alive_threads</span> <span class="o">=</span> <span class="p">[</span><span class="n">t</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">threading</span><span class="o">.</span><span class="n">enumerate</span><span class="p">()</span> <span class="k">if</span> <span class="n">t</span><span class="o">.</span><span class="n">is_alive</span><span class="p">()]</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
<span class="sa">f</span><span class="s1">&#39;All alive threads after shutdown: </span><span class="si">{</span><span class="p">[</span><span class="n">t</span><span class="o">.</span><span class="n">name</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">t</span><span class="w"> </span><span class="ow">in</span><span class="w"> </span><span class="n">alive_threads</span><span class="p">]</span><span class="si">}</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">,</span>
<span class="s2">&quot;red&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">alive_threads</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;Thread </span><span class="si">{</span><span class="n">t</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s1"> (daemon=</span><span class="si">{</span><span class="n">t</span><span class="o">.</span><span class="n">daemon</span><span class="si">}</span><span class="s1">) is still alive&#39;</span><span class="p">)</span>
<span class="c1"># Get the stack trace for this thread</span>
<span class="n">stack</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">_current_frames</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">t</span><span class="o">.</span><span class="n">ident</span><span class="p">)</span>
<span class="k">if</span> <span class="n">stack</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;Stack trace for thread </span><span class="si">{</span><span class="n">t</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="s1">:&#39;</span><span class="p">)</span>
<span class="n">traceback</span><span class="o">.</span><span class="n">print_stack</span><span class="p">(</span><span class="n">stack</span><span class="p">,</span> <span class="n">file</span><span class="o">=</span><span class="n">sys</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">&#39;&#39;</span><span class="p">)</span>
</pre></div>
</article>
@ -770,9 +791,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -8722,9 +8722,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -647,9 +647,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -3512,9 +3512,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -654,9 +654,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -903,9 +903,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1370,9 +1370,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1218,9 +1218,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1244,9 +1244,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1008,9 +1008,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -663,9 +663,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -947,9 +947,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -541,12 +541,14 @@
<span class="n">create_input_processor_with_hash</span><span class="p">,</span> <span class="n">prompt_inputs</span><span class="p">)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..sampling_params</span><span class="w"> </span><span class="kn">import</span> <span class="n">SamplingParams</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.llm_args</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">LLMARGS_EXPLICIT_DOCSTRING</span><span class="p">,</span> <span class="n">PybindMirror</span><span class="p">,</span> <span class="n">TorchLlmArgs</span><span class="p">,</span>
<span class="n">TrtLlmArgs</span><span class="p">,</span> <span class="n">_AutoDeployLlmArgs</span><span class="p">)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.llm_args</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">TORCH_LLMARGS_EXPLICIT_DOCSTRING</span><span class="p">,</span>
<span class="n">TRT_LLMARGS_EXPLICIT_DOCSTRING</span><span class="p">,</span> <span class="n">PybindMirror</span><span class="p">,</span>
<span class="n">TorchLlmArgs</span><span class="p">,</span> <span class="n">TrtLlmArgs</span><span class="p">,</span> <span class="n">_AutoDeployLlmArgs</span><span class="p">)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.llm_utils</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">CachedModelLoader</span><span class="p">,</span> <span class="n">KvCacheRetentionConfig</span><span class="p">,</span>
<span class="n">LlmBuildStats</span><span class="p">,</span> <span class="n">ModelLoader</span><span class="p">,</span> <span class="n">_ModelRuntimeContext</span><span class="p">)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.mpi_session</span><span class="w"> </span><span class="kn">import</span> <span class="n">MpiPoolSession</span><span class="p">,</span> <span class="n">external_mpi_comm_available</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.tokenizer</span><span class="w"> </span><span class="kn">import</span> <span class="n">TokenizerBase</span><span class="p">,</span> <span class="n">_xgrammar_tokenizer_info</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.tokenizer</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">TokenizerBase</span><span class="p">,</span> <span class="n">_llguidance_tokenizer_info</span><span class="p">,</span>
<span class="n">_xgrammar_tokenizer_info</span><span class="p">)</span>
<span class="c1"># TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.utils</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">append_docstring</span><span class="p">,</span> <span class="n">exception_handler</span><span class="p">,</span> <span class="n">get_device_count</span><span class="p">,</span>
<span class="n">print_colored_debug</span><span class="p">)</span>
@ -599,8 +601,7 @@
<span class="n">LLM_DOCSTRING</span> <span class="o">=</span> <span class="n">LLMARGS_EXPLICIT_DOCSTRING</span> <span class="o">+</span> <span class="s2">&quot;&quot;&quot;</span>
<span class="s2"> kwargs (Any): Advanced arguments passed to `LlmArgs`.</span>
<span class="n">TRT_LLM_DOCSTRING</span> <span class="o">=</span> <span class="n">TRT_LLMARGS_EXPLICIT_DOCSTRING</span> <span class="o">+</span> <span class="s2">&quot;&quot;&quot;</span>
<span class="s2"> Attributes:</span>
<span class="s2"> tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.</span>
@ -608,18 +609,19 @@
<span class="s2"> llm_id (str): The unique ID of the LLM instance.</span>
<span class="s2">&quot;&quot;&quot;</span>
<span class="n">TORCH_LLM_DOCSTRING</span> <span class="o">=</span> <span class="n">TORCH_LLMARGS_EXPLICIT_DOCSTRING</span> <span class="o">+</span> <span class="s2">&quot;&quot;&quot;</span>
<div class="viewcode-block" id="LLM">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM">[docs]</a>
<span class="nd">@append_docstring</span><span class="p">(</span><span class="n">LLM_DOCSTRING</span><span class="p">)</span>
<span class="k">class</span><span class="w"> </span><span class="nc">LLM</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;LLM class is the main class for running a LLM model.</span>
<span class="s2"> Attributes:</span>
<span class="s2"> tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.</span>
<span class="s2"> llm_id (str): The unique ID of the LLM instance.</span>
<span class="s2">&quot;&quot;&quot;</span>
<span class="sd"> Parameters:</span>
<span class="sd">&quot;&quot;&quot;</span>
<div class="viewcode-block" id="LLM.__init__">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.__init__">[docs]</a>
<span class="k">class</span><span class="w"> </span><span class="nc">BaseLLM</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The base class for all LLM classes.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
<span class="n">model</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">],</span>
<span class="n">tokenizer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">,</span> <span class="n">TokenizerBase</span><span class="p">,</span>
@ -706,6 +708,8 @@
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">TemporaryDirectory</span><span class="p">(</span>
<span class="n">suffix</span><span class="o">=</span><span class="s2">&quot;-llm-workspace&quot;</span><span class="p">,</span> <span class="nb">dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">workspace</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span> <span class="o">=</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_hf_model_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Path</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
@ -720,12 +724,7 @@
<span class="k">raise</span>
<span class="n">exception_handler</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s1">&#39;shutdown&#39;</span><span class="p">)</span>
<span class="n">atexit</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="n">LLM</span><span class="o">.</span><span class="n">_shutdown_wrapper</span><span class="p">,</span> <span class="n">weakref</span><span class="o">.</span><span class="n">ref</span><span class="p">(</span><span class="bp">self</span><span class="p">))</span></div>
<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">workspace</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Path</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Path</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="k">else</span> <span class="kc">None</span>
<span class="n">atexit</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="n">LLM</span><span class="o">.</span><span class="n">_shutdown_wrapper</span><span class="p">,</span> <span class="n">weakref</span><span class="o">.</span><span class="n">ref</span><span class="p">(</span><span class="bp">self</span><span class="p">))</span>
<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">llm_id</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
@ -737,8 +736,6 @@
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_llm_id</span>
<div class="viewcode-block" id="LLM.generate">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.generate">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">generate</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">inputs</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">PromptInputs</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">PromptInputs</span><span class="p">]],</span>
@ -812,11 +809,8 @@
<span class="k">if</span> <span class="n">unbatched</span><span class="p">:</span>
<span class="n">futures</span> <span class="o">=</span> <span class="n">futures</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">return</span> <span class="n">futures</span></div>
<span class="k">return</span> <span class="n">futures</span>
<div class="viewcode-block" id="LLM.generate_async">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.generate_async">[docs]</a>
<span class="nd">@nvtx_range_debug</span><span class="p">(</span><span class="s2">&quot;LLM.generate_async&quot;</span><span class="p">,</span> <span class="n">color</span><span class="o">=</span><span class="s2">&quot;green&quot;</span><span class="p">,</span> <span class="n">category</span><span class="o">=</span><span class="s2">&quot;LLM&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">generate_async</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
@ -934,11 +928,8 @@
<span class="p">)</span>
<span class="k">return</span> <span class="n">RequestOutput</span><span class="o">.</span><span class="n">_from_generation_result</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">prompt</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span><span class="p">)</span></div>
<span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span><span class="p">)</span>
<div class="viewcode-block" id="LLM.get_stats">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.get_stats">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">get_stats</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mi">2</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">dict</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&#39;&#39;&#39;Get iteration statistics from the runtime.</span>
<span class="sd"> To collect statistics, call this function after prompts have been submitted with LLM().generate().</span>
@ -950,11 +941,8 @@
<span class="sd"> List[dict]: A list of runtime stats as dict.</span>
<span class="sd"> e.g., [&#39;{&quot;cpuMemUsage&quot;: ..., &quot;iter&quot;: 0, ...}&#39;, &#39;{&quot;cpuMemUsage&quot;: ..., &quot;iter&quot;: 1, ...}&#39;]</span>
<span class="sd"> &#39;&#39;&#39;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">get_stats</span><span class="p">(</span><span class="n">timeout</span><span class="o">=</span><span class="n">timeout</span><span class="p">)</span></div>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">get_stats</span><span class="p">(</span><span class="n">timeout</span><span class="o">=</span><span class="n">timeout</span><span class="p">)</span>
<div class="viewcode-block" id="LLM.get_stats_async">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.get_stats_async">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">get_stats_async</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mi">2</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IterationResult</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&#39;&#39;&#39;Get iteration statistics from the runtime.</span>
<span class="sd"> To collect statistics, you can call this function in an async coroutine or the /metrics endpoint (if you&#39;re using trtllm-serve)</span>
@ -966,11 +954,8 @@
<span class="sd"> Returns:</span>
<span class="sd"> tensorrt_llm.executor.result.IterationResult: An async iterable object containing runtime stats.</span>
<span class="sd"> &#39;&#39;&#39;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">aget_stats</span><span class="p">(</span><span class="n">timeout</span><span class="o">=</span><span class="n">timeout</span><span class="p">)</span></div>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">aget_stats</span><span class="p">(</span><span class="n">timeout</span><span class="o">=</span><span class="n">timeout</span><span class="p">)</span>
<div class="viewcode-block" id="LLM.get_kv_cache_events">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.get_kv_cache_events">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">get_kv_cache_events</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">timeout</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mi">2</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">dict</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&#39;&#39;&#39;Get iteration KV events from the runtime.</span>
@ -990,11 +975,8 @@
<span class="sd"> Returns:</span>
<span class="sd"> List[dict]: A list of runtime events as dict.</span>
<span class="sd"> &#39;&#39;&#39;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">get_kv_events</span><span class="p">(</span><span class="n">timeout</span><span class="o">=</span><span class="n">timeout</span><span class="p">)</span></div>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">get_kv_events</span><span class="p">(</span><span class="n">timeout</span><span class="o">=</span><span class="n">timeout</span><span class="p">)</span>
<div class="viewcode-block" id="LLM.get_kv_cache_events_async">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.get_kv_cache_events_async">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">get_kv_cache_events_async</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
<span class="n">timeout</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mi">2</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IterationResult</span><span class="p">:</span>
@ -1016,8 +998,7 @@
<span class="sd"> Returns:</span>
<span class="sd"> tensorrt_llm.executor.result.IterationResult: An async iterable object containing runtime events.</span>
<span class="sd"> &#39;&#39;&#39;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">aget_kv_events</span><span class="p">(</span><span class="n">timeout</span><span class="o">=</span><span class="n">timeout</span><span class="p">)</span></div>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">aget_kv_events</span><span class="p">(</span><span class="n">timeout</span><span class="o">=</span><span class="n">timeout</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_prepare_sampling_params</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
@ -1123,7 +1104,7 @@
<span class="k">def</span><span class="w"> </span><span class="nf">_build_model</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">model_loader</span> <span class="o">=</span> <span class="n">CachedModelLoader</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">,</span>
<span class="n">mpi_session</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">mpi_session</span><span class="p">,</span>
<span class="n">workspace</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">workspace</span><span class="p">,</span>
<span class="n">workspace</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span><span class="p">,</span>
<span class="n">llm_build_stats</span><span class="o">=</span><span class="n">weakref</span><span class="o">.</span><span class="n">proxy</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">llm_build_stats</span><span class="p">))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_hf_model_dir</span> <span class="o">=</span> <span class="n">model_loader</span><span class="p">()</span>
@ -1200,6 +1181,11 @@
<span class="n">backend</span><span class="o">=</span><span class="n">tllm</span><span class="o">.</span><span class="n">GuidedDecodingConfig</span><span class="o">.</span><span class="n">GuidedDecodingBackend</span><span class="o">.</span>
<span class="n">XGRAMMAR</span><span class="p">,</span>
<span class="o">**</span><span class="n">_xgrammar_tokenizer_info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span><span class="p">))</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">guided_decoding_backend</span> <span class="o">==</span> <span class="s1">&#39;llguidance&#39;</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_executor_config</span><span class="o">.</span><span class="n">guided_decoding_config</span> <span class="o">=</span> <span class="n">tllm</span><span class="o">.</span><span class="n">GuidedDecodingConfig</span><span class="p">(</span>
<span class="n">backend</span><span class="o">=</span><span class="n">tllm</span><span class="o">.</span><span class="n">GuidedDecodingConfig</span><span class="o">.</span><span class="n">GuidedDecodingBackend</span><span class="o">.</span>
<span class="n">LLGUIDANCE</span><span class="p">,</span>
<span class="o">**</span><span class="n">_llguidance_tokenizer_info</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span><span class="p">))</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">guided_decoding_backend</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Unrecognized guided decoding backend </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">guided_decoding_backend</span><span class="si">}</span><span class="s2">&quot;</span>
@ -1247,7 +1233,9 @@
<span class="n">postprocess_tokenizer_dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">postprocess_tokenizer_dir</span><span class="p">,</span>
<span class="p">),</span>
<span class="n">is_llm_executor</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">lora_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">lora_config</span><span class="p">)</span>
<span class="n">lora_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">lora_config</span><span class="p">,</span>
<span class="n">garbage_collection_gen0_threshold</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span>
<span class="n">garbage_collection_gen0_threshold</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_on_trt_backend</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
@ -1303,8 +1291,66 @@
<span class="k">def</span><span class="w"> </span><span class="nf">tokenizer</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokenizer</span><span class="p">:</span> <span class="n">TokenizerBase</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_tokenizer</span> <span class="o">=</span> <span class="n">tokenizer</span>
<div class="viewcode-block" id="LLM.save">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.save">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">shutdown</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">&quot;_executor&quot;</span><span class="p">)</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_executor</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s1">&#39;mpi_session&#39;</span><span class="p">)</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">mpi_session</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">mpi_session</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">mpi_session</span> <span class="o">=</span> <span class="kc">None</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_shutdown_wrapper</span><span class="p">(</span><span class="n">self_ref</span><span class="p">):</span>
<span class="c1"># Retrieve the instance if it still exists</span>
<span class="n">instance</span> <span class="o">=</span> <span class="n">self_ref</span><span class="p">()</span>
<span class="k">if</span> <span class="n">instance</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">instance</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__exit__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">exc_type</span><span class="p">,</span> <span class="n">exc_value</span><span class="p">,</span> <span class="n">traceback</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">del</span> <span class="n">exc_value</span><span class="p">,</span> <span class="n">traceback</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="k">return</span> <span class="kc">False</span> <span class="c1"># propagate exceptions</span>
<span class="k">def</span><span class="w"> </span><span class="nf">__getstate__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span><span class="s2">&quot;LLM object can not be pickled.&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__del__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="nd">@append_docstring</span><span class="p">(</span><span class="n">TRT_LLM_DOCSTRING</span><span class="p">)</span>
<span class="k">class</span><span class="w"> </span><span class="nc">_TrtLLM</span><span class="p">(</span><span class="n">BaseLLM</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;LLM class is the main class for running a LLM model using TensorRT-LLM backend.</span>
<span class="sd"> Parameters:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
<span class="n">model</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">],</span>
<span class="n">tokenizer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">,</span> <span class="n">TokenizerBase</span><span class="p">,</span>
<span class="n">PreTrainedTokenizerBase</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">tokenizer_mode</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s1">&#39;auto&#39;</span><span class="p">,</span> <span class="s1">&#39;slow&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="s1">&#39;auto&#39;</span><span class="p">,</span>
<span class="n">skip_tokenizer_init</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">trust_remote_code</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">tensor_parallel_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;auto&quot;</span><span class="p">,</span>
<span class="n">revision</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">tokenizer_revision</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># TODO: deprecate backend in LLM kwargs</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">tokenizer</span><span class="p">,</span> <span class="n">tokenizer_mode</span><span class="p">,</span> <span class="n">skip_tokenizer_init</span><span class="p">,</span>
<span class="n">trust_remote_code</span><span class="p">,</span> <span class="n">tensor_parallel_size</span><span class="p">,</span> <span class="n">dtype</span><span class="p">,</span>
<span class="n">revision</span><span class="p">,</span> <span class="n">tokenizer_revision</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">workspace</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Path</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Path</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="k">else</span> <span class="kc">None</span>
<span class="k">def</span><span class="w"> </span><span class="nf">save</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">engine_dir</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Save the built engine to the given path.</span>
@ -1328,42 +1374,100 @@
<span class="k">for</span> <span class="n">file</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span><span class="o">.</span><span class="n">iterdir</span><span class="p">():</span>
<span class="n">print_colored_debug</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Copying </span><span class="si">{</span><span class="n">file</span><span class="si">}</span><span class="s2"> to </span><span class="si">{</span><span class="n">target_engine_dir</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="n">file</span><span class="o">.</span><span class="n">name</span><span class="si">}</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">shutil</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">file</span><span class="p">,</span> <span class="n">target_engine_dir</span> <span class="o">/</span> <span class="n">file</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div>
<span class="n">shutil</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">file</span><span class="p">,</span> <span class="n">target_engine_dir</span> <span class="o">/</span> <span class="n">file</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<div class="viewcode-block" id="LLM.shutdown">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.shutdown">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">shutdown</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">&quot;_executor&quot;</span><span class="p">)</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_executor</span> <span class="o">=</span> <span class="kc">None</span>
<span class="nd">@append_docstring</span><span class="p">(</span><span class="n">TORCH_LLM_DOCSTRING</span><span class="p">)</span>
<span class="k">class</span><span class="w"> </span><span class="nc">_TorchLLM</span><span class="p">(</span><span class="n">BaseLLM</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;LLM class is the main class for running a LLM model using PyTorch backend.</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s1">&#39;mpi_session&#39;</span><span class="p">)</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">mpi_session</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">mpi_session</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">mpi_session</span> <span class="o">=</span> <span class="kc">None</span></div>
<span class="sd"> Parameters:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
<span class="n">model</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">],</span>
<span class="n">tokenizer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">,</span> <span class="n">TokenizerBase</span><span class="p">,</span>
<span class="n">PreTrainedTokenizerBase</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">tokenizer_mode</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s1">&#39;auto&#39;</span><span class="p">,</span> <span class="s1">&#39;slow&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="s1">&#39;auto&#39;</span><span class="p">,</span>
<span class="n">skip_tokenizer_init</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">trust_remote_code</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">tensor_parallel_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;auto&quot;</span><span class="p">,</span>
<span class="n">revision</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">tokenizer_revision</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># TODO: deprecate backend in LLM kwargs</span>
<span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s2">&quot;backend&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="c1"># Validate that users don&#39;t pass TrtLlmArgs-specific arguments</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate_args_for_torch_backend</span><span class="p">(</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">model</span><span class="p">,</span>
<span class="n">tokenizer</span><span class="p">,</span>
<span class="n">tokenizer_mode</span><span class="p">,</span>
<span class="n">skip_tokenizer_init</span><span class="p">,</span>
<span class="n">trust_remote_code</span><span class="p">,</span>
<span class="n">tensor_parallel_size</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">,</span>
<span class="n">revision</span><span class="p">,</span>
<span class="n">tokenizer_revision</span><span class="p">,</span>
<span class="n">backend</span><span class="o">=</span><span class="s1">&#39;pytorch&#39;</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_validate_args_for_torch_backend</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">kwargs</span><span class="p">:</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Validate that users don&#39;t pass TrtLlmArgs-specific arguments when using PyTorch backend.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">trtllm_fields</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">TrtLlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="n">torchllm_fields</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">TorchLlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="n">trtllm_specific_fields</span> <span class="o">=</span> <span class="n">trtllm_fields</span> <span class="o">-</span> <span class="n">torchllm_fields</span>
<span class="c1"># Check if any TrtLlmArgs-specific arguments are passed</span>
<span class="n">trtllm_specific_args</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="p">:</span>
<span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">trtllm_specific_fields</span><span class="p">:</span>
<span class="n">trtllm_specific_args</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="k">if</span> <span class="n">trtllm_specific_args</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;The following arguments are specific to TensorRT backend and cannot be used with PyTorch backend: </span><span class="si">{</span><span class="n">trtllm_specific_args</span><span class="si">}</span><span class="s2">.</span><span class="se">\n</span><span class="s2">&quot;</span>
<span class="sa">f</span><span class="s2">&quot;Please use &#39;from tensorrt_llm._tensorrt_engine import LLM&#39; instead to use the TensorRT backend.&quot;</span>
<span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_shutdown_wrapper</span><span class="p">(</span><span class="n">self_ref</span><span class="p">):</span>
<span class="c1"># Retrieve the instance if it still exists</span>
<span class="n">instance</span> <span class="o">=</span> <span class="n">self_ref</span><span class="p">()</span>
<span class="k">if</span> <span class="n">instance</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">instance</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<div class="viewcode-block" id="LLM">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM">[docs]</a>
<span class="k">class</span><span class="w"> </span><span class="nc">LLM</span><span class="p">(</span><span class="n">_TorchLLM</span><span class="p">):</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
<span class="n">model</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">],</span>
<span class="n">tokenizer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">,</span> <span class="n">TokenizerBase</span><span class="p">,</span>
<span class="n">PreTrainedTokenizerBase</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">tokenizer_mode</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s1">&#39;auto&#39;</span><span class="p">,</span> <span class="s1">&#39;slow&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="s1">&#39;auto&#39;</span><span class="p">,</span>
<span class="n">skip_tokenizer_init</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">trust_remote_code</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">tensor_parallel_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;auto&quot;</span><span class="p">,</span>
<span class="n">revision</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">tokenizer_revision</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">tokenizer</span><span class="p">,</span> <span class="n">tokenizer_mode</span><span class="p">,</span> <span class="n">skip_tokenizer_init</span><span class="p">,</span>
<span class="n">trust_remote_code</span><span class="p">,</span> <span class="n">tensor_parallel_size</span><span class="p">,</span> <span class="n">dtype</span><span class="p">,</span>
<span class="n">revision</span><span class="p">,</span> <span class="n">tokenizer_revision</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<span class="k">def</span><span class="w"> </span><span class="fm">__exit__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">exc_type</span><span class="p">,</span> <span class="n">exc_value</span><span class="p">,</span> <span class="n">traceback</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">del</span> <span class="n">exc_value</span><span class="p">,</span> <span class="n">traceback</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="k">return</span> <span class="kc">False</span> <span class="c1"># propagate exceptions</span>
<span class="k">def</span><span class="w"> </span><span class="nf">__getstate__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span><span class="s2">&quot;LLM object can not be pickled.&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__del__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span></div>
<span class="n">_LLM_REPR</span> <span class="o">=</span> <span class="s2">&quot;TorchLLM&quot;</span>
<span class="c1"># sphinx will ignore the LLM&#39;s docstring if it is not explicitly set</span>
<span class="n">LLM</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> \
<span class="sa">f</span><span class="s2">&quot;&quot;&quot;LLM class is the main class for running a LLM model.</span>
<span class="s2"> This class is an alias of </span><span class="si">{</span><span class="n">_LLM_REPR</span><span class="si">}</span><span class="s2">.</span>
<span class="s2"> Parameters:</span>
<span class="s2">&quot;&quot;&quot;</span> <span class="o">+</span> <span class="n">TORCH_LLM_DOCSTRING</span>
</pre></div>
</article>
@ -1475,9 +1579,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -726,6 +726,7 @@
<span class="s2">&quot;Eagle&quot;</span><span class="p">:</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span>
<span class="s2">&quot;Lookahead&quot;</span><span class="p">:</span> <span class="n">LookaheadDecodingConfig</span><span class="p">,</span>
<span class="s2">&quot;NGram&quot;</span><span class="p">:</span> <span class="n">NGramDecodingConfig</span><span class="p">,</span>
<span class="s2">&quot;DraftTarget&quot;</span><span class="p">:</span> <span class="n">DraftTargetDecodingConfig</span><span class="p">,</span>
<span class="p">}</span>
<span class="n">config_class</span> <span class="o">=</span> <span class="n">config_classes</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">decoding_type</span><span class="p">)</span>
@ -765,7 +766,7 @@
<span class="n">dynamic_tree_max_topK</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">num_eagle_layers</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">max_non_leaves_per_layer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">pytorch_eagle_weights_path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">pytorch_weights_path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">eagle3_one_model</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
<div class="viewcode-block" id="EagleDecodingConfig.from_dict">
@ -819,6 +820,22 @@
<div class="viewcode-block" id="DraftTargetDecodingConfig">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.DraftTargetDecodingConfig">[docs]</a>
<span class="k">class</span><span class="w"> </span><span class="nc">DraftTargetDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
<span class="n">pytorch_weights_path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<div class="viewcode-block" id="DraftTargetDecodingConfig.from_dict">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.DraftTargetDecodingConfig.from_dict">[docs]</a>
<span class="nd">@classmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">from_dict</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">cls</span><span class="p">(</span><span class="o">**</span><span class="n">data</span><span class="p">)</span></div>
<span class="n">decoding_type</span><span class="p">:</span> <span class="n">ClassVar</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;DraftTarget&quot;</span></div>
<div class="viewcode-block" id="MTPDecodingConfig">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig">[docs]</a>
<span class="k">class</span><span class="w"> </span><span class="nc">MTPDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
@ -826,6 +843,7 @@
<span class="n">use_relaxed_acceptance_for_thinking</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">relaxed_topk</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
<span class="n">relaxed_delta</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mf">0.</span>
<span class="n">use_mtp_vanilla</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span>
<div class="viewcode-block" id="MTPDecodingConfig.from_dict">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig.from_dict">[docs]</a>
@ -1478,10 +1496,11 @@
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Cache transceiver config.&quot;</span><span class="p">)</span>
<span class="c1"># Speculative decoding parameters</span>
<span class="n">speculative_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span>
<span class="n">LookaheadDecodingConfig</span><span class="p">,</span> <span class="n">MedusaDecodingConfig</span><span class="p">,</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span>
<span class="n">MTPDecodingConfig</span><span class="p">,</span> <span class="n">NGramDecodingConfig</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Speculative decoding config.&quot;</span><span class="p">)</span>
<span class="n">speculative_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
<span class="n">Union</span><span class="p">[</span><span class="n">LookaheadDecodingConfig</span><span class="p">,</span> <span class="n">MedusaDecodingConfig</span><span class="p">,</span>
<span class="n">EagleDecodingConfig</span><span class="p">,</span> <span class="n">MTPDecodingConfig</span><span class="p">,</span> <span class="n">NGramDecodingConfig</span><span class="p">,</span>
<span class="n">DraftTargetDecodingConfig</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Speculative decoding config.&quot;</span><span class="p">)</span>
<span class="n">batching_type</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">BatchingType</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Batching type.&quot;</span><span class="p">)</span>
@ -1523,6 +1542,12 @@
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;The parser to separate reasoning content from output.&quot;</span><span class="p">)</span>
<span class="n">garbage_collection_gen0_threshold</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="mi">20000</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span>
<span class="s2">&quot;Threshold for Python garbage collection of generation 0 objects.&quot;</span>
<span class="s2">&quot;Lower values trigger more frequent garbage collection.&quot;</span><span class="p">)</span>
<span class="c1"># TODO[Superjomn]: To deprecate this config.</span>
<span class="n">decoding_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
@ -1878,7 +1903,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">Eagle3Config</span><span class="p">(</span>
<span class="n">max_draft_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span><span class="p">,</span>
<span class="n">draft_model_path</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">pytorch_eagle_weights_path</span><span class="p">,</span>
<span class="n">pytorch_weights_path</span><span class="p">,</span>
<span class="n">eagle3_one_model</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">eagle3_one_model</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">NGramDecodingConfig</span><span class="p">):</span>
@ -1896,6 +1921,16 @@
<span class="n">is_use_oldest</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">is_use_oldest</span><span class="p">,</span>
<span class="n">is_public_pool</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">is_public_pool</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">DraftTargetDecodingConfig</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">speculative_decoding_mode</span> <span class="o">=</span> <span class="n">SpeculativeDecodingMode</span><span class="o">.</span><span class="n">DRAFT_TOKENS_EXTERNAL</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">backend</span> <span class="o">==</span> <span class="s1">&#39;pytorch&#39;</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span> <span class="o">&gt;</span> <span class="mi">0</span>
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_draft_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.speculative</span><span class="w"> </span><span class="kn">import</span> <span class="n">DraftTargetConfig</span>
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">DraftTargetConfig</span><span class="p">(</span>
<span class="n">max_draft_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span><span class="p">,</span>
<span class="n">draft_model_path</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">pytorch_weights_path</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">MTPDecodingConfig</span><span class="p">):</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.speculative</span><span class="w"> </span><span class="kn">import</span> <span class="n">MTPConfig</span>
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">MTPConfig</span><span class="p">(</span>
@ -1905,7 +1940,8 @@
<span class="n">use_relaxed_acceptance_for_thinking</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">use_relaxed_acceptance_for_thinking</span><span class="p">,</span>
<span class="n">relaxed_topk</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">relaxed_topk</span><span class="p">,</span>
<span class="n">relaxed_delta</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">relaxed_delta</span><span class="p">)</span>
<span class="n">relaxed_delta</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">relaxed_delta</span><span class="p">,</span>
<span class="n">use_mtp_vanilla</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">use_mtp_vanilla</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Speculative config type not recognized: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="si">}</span><span class="s2">&quot;</span>
@ -2160,12 +2196,6 @@
<span class="n">LlmArgs</span> <span class="o">=</span> <span class="n">TrtLlmArgs</span>
<span class="n">LLMARGS_EXPLICIT_DOCSTRING</span> <span class="o">=</span> <span class="n">generate_api_docs_as_docstring</span><span class="p">(</span><span class="n">LlmArgs</span><span class="p">,</span>
<span class="n">indent</span><span class="o">=</span><span class="s1">&#39; &#39;</span> <span class="o">*</span> <span class="mi">4</span><span class="p">)</span>
<span class="k">class</span><span class="w"> </span><span class="nc">LoadFormat</span><span class="p">(</span><span class="n">Enum</span><span class="p">):</span>
<span class="n">AUTO</span> <span class="o">=</span> <span class="mi">0</span>
<span class="c1"># Initialize all weights randomly.</span>
@ -2178,18 +2208,18 @@
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Configuration for torch.compile.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">torch_compile_fullgraph</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">enable_fullgraph</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable full graph compilation in torch.compile.&quot;</span><span class="p">)</span>
<span class="n">torch_compile_inductor_enabled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">enable_inductor</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable inductor backend in torch.compile.&quot;</span><span class="p">)</span>
<span class="n">torch_compile_piecewise_cuda_graph</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">enable_piecewise_cuda_graph</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable piecewise CUDA graph in torch.compile.&quot;</span><span class="p">)</span>
<span class="n">torch_compile_enable_userbuffers</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">enable_userbuffers</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span>
<span class="s2">&quot;When torch compile is enabled, userbuffers is enabled by default.&quot;</span><span class="p">)</span></div>
@ -2240,7 +2270,10 @@
<span class="n">moe_load_balancer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">object</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Configuration for MoE load balancing.&quot;</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="s2">&quot;Union[MoeLoadBalancerConfig, str]&quot;</span><span class="p">})</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;type&quot;</span><span class="p">:</span>
<span class="s2">&quot;Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]&quot;</span>
<span class="p">})</span>
<span class="n">attn_backend</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s1">&#39;TRTLLM&#39;</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Attention backend to use.&quot;</span><span class="p">)</span>
@ -2297,6 +2330,14 @@
<span class="s2">&quot;If true, enable min-latency mode. Currently only used for Llama4.&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># TODO: make this a per-request parameter</span>
<span class="n">stream_interval</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span>
<span class="s2">&quot;The iteration interval to create responses under the streaming mode. &quot;</span>
<span class="s2">&quot;Set this to a larger value when the batch size is large, which helps reduce the streaming overhead.&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># TODO: remove backend later</span>
<div class="viewcode-block" id="TorchLlmArgs.init_backend">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.init_backend">[docs]</a>
@ -2358,6 +2399,16 @@
<span class="k">return</span> <span class="bp">self</span></div>
<div class="viewcode-block" id="TorchLlmArgs.validate_stream_interval">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_stream_interval">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_stream_interval</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">stream_interval</span> <span class="o">&lt;=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;stream_interval must be positive, got </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">stream_interval</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span></div>
<span class="c1"># TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig</span>
<div class="viewcode-block" id="TorchLlmArgs.get_pytorch_backend_config">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.get_pytorch_backend_config">[docs]</a>
@ -2382,22 +2433,21 @@
<span class="n">enable_iter_req_stats</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_iter_req_stats</span><span class="p">,</span>
<span class="n">print_iter_log</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">print_iter_log</span><span class="p">,</span>
<span class="n">torch_compile_enabled</span><span class="o">=</span><span class="nb">bool</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">),</span>
<span class="n">torch_compile_fullgraph</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span><span class="o">.</span>
<span class="n">torch_compile_fullgraph</span>
<span class="n">torch_compile_fullgraph</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span><span class="o">.</span><span class="n">enable_fullgraph</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">torch_compile_inductor_enabled</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span><span class="o">.</span>
<span class="n">torch_compile_inductor_enabled</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">enable_inductor</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">torch_compile_piecewise_cuda_graph</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span><span class="o">.</span>
<span class="n">torch_compile_piecewise_cuda_graph</span>
<span class="n">enable_piecewise_cuda_graph</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">torch_compile_enable_userbuffers</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span><span class="o">.</span>
<span class="n">torch_compile_enable_userbuffers</span>
<span class="n">enable_userbuffers</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">autotuner_enabled</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">autotuner_enabled</span><span class="p">,</span>
<span class="n">enable_layerwise_nvtx_marker</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_layerwise_nvtx_marker</span><span class="p">,</span>
<span class="n">load_format</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">load_format</span><span class="p">,</span>
<span class="n">enable_min_latency</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_min_latency</span><span class="p">)</span></div>
<span class="n">enable_min_latency</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_min_latency</span><span class="p">,</span>
<span class="n">stream_interval</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">stream_interval</span><span class="p">)</span></div>
<div class="viewcode-block" id="TorchLlmArgs.validate_cuda_graph_max_batch_size">
@ -2661,6 +2711,15 @@
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">model_format</span>
<span class="n">LlmArgs</span> <span class="o">=</span> <span class="n">TorchLlmArgs</span>
<span class="n">TRT_LLMARGS_EXPLICIT_DOCSTRING</span> <span class="o">=</span> <span class="n">generate_api_docs_as_docstring</span><span class="p">(</span><span class="n">TrtLlmArgs</span><span class="p">,</span>
<span class="n">indent</span><span class="o">=</span><span class="s1">&#39; &#39;</span> <span class="o">*</span> <span class="mi">4</span><span class="p">)</span>
<span class="n">TORCH_LLMARGS_EXPLICIT_DOCSTRING</span> <span class="o">=</span> <span class="n">generate_api_docs_as_docstring</span><span class="p">(</span><span class="n">TorchLlmArgs</span><span class="p">,</span>
<span class="n">indent</span><span class="o">=</span><span class="s1">&#39; &#39;</span> <span class="o">*</span>
<span class="mi">4</span><span class="p">)</span>
</pre></div>
</article>
@ -2772,9 +2831,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -805,6 +805,7 @@
<span class="sa">f</span><span class="s2">&quot;RemoteMpiCommSessionClient connecting to </span><span class="si">{</span><span class="n">addr</span><span class="si">}</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">,</span> <span class="s2">&quot;yellow&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">queue</span> <span class="o">=</span> <span class="n">ZeroMqQueue</span><span class="p">((</span><span class="n">addr</span><span class="p">,</span> <span class="n">hmac_key</span><span class="p">),</span>
<span class="n">is_server</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">socket_type</span><span class="o">=</span><span class="n">zmq</span><span class="o">.</span><span class="n">PAIR</span><span class="p">,</span>
<span class="n">use_hmac_encryption</span><span class="o">=</span><span class="nb">bool</span><span class="p">(</span><span class="n">hmac_key</span><span class="p">))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_is_shutdown</span> <span class="o">=</span> <span class="kc">False</span>
@ -860,23 +861,10 @@
<span class="bp">self</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="k">def</span><span class="w"> </span><span class="nf">shutdown</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">wait</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_is_shutdown</span><span class="p">:</span>
<span class="k">return</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">print_colored_debug</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;RemoteMpiCommSessionClient [rank</span><span class="si">{</span><span class="n">global_mpi_rank</span><span class="p">()</span><span class="si">}</span><span class="s2">] send shutdown signal to server</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">,</span>
<span class="s2">&quot;green&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">queue</span><span class="o">.</span><span class="n">put</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> <span class="c1"># ask RemoteMpiCommSessionServer to shutdown</span>
<span class="k">except</span> <span class="n">zmq</span><span class="o">.</span><span class="n">error</span><span class="o">.</span><span class="n">ZMQError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="n">print_colored_debug</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Error during RemoteMpiCommSessionClient shutdown: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">,</span>
<span class="s2">&quot;red&quot;</span><span class="p">)</span>
<span class="k">finally</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_is_shutdown</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">pass</span>
<span class="k">def</span><span class="w"> </span><span class="nf">shutdown_abort</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">grace</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mi">60</span><span class="p">,</span> <span class="n">reason</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shutdown</span><span class="p">()</span>
<span class="k">pass</span>
<span class="k">class</span><span class="w"> </span><span class="nc">RemoteMpiCommSessionServer</span><span class="p">():</span>
@ -895,6 +883,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">addr</span> <span class="o">=</span> <span class="n">addr</span>
<span class="bp">self</span><span class="o">.</span><span class="n">queue</span> <span class="o">=</span> <span class="n">ZeroMqQueue</span><span class="p">((</span><span class="n">addr</span><span class="p">,</span> <span class="n">hmac_key</span><span class="p">),</span>
<span class="n">is_server</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">socket_type</span><span class="o">=</span><span class="n">zmq</span><span class="o">.</span><span class="n">PAIR</span><span class="p">,</span>
<span class="n">use_hmac_encryption</span><span class="o">=</span><span class="nb">bool</span><span class="p">(</span><span class="n">hmac_key</span><span class="p">))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">comm</span> <span class="o">=</span> <span class="n">comm</span>
<span class="bp">self</span><span class="o">.</span><span class="n">results</span> <span class="o">=</span> <span class="p">[]</span> <span class="c1"># the results may arrive in any order</span>
@ -976,7 +965,15 @@
<span class="n">print_colored_debug</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;RemoteMpiCommSessionServer received all results, sending to client</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">,</span>
<span class="s2">&quot;green&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">queue</span><span class="o">.</span><span class="n">put</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">results</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">queue</span><span class="o">.</span><span class="n">put_noblock</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">results</span><span class="p">)</span>
<span class="k">except</span> <span class="n">zmq</span><span class="o">.</span><span class="n">ZMQError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="c1"># The client could be shutdown first.</span>
<span class="k">if</span> <span class="n">e</span><span class="o">.</span><span class="n">errno</span> <span class="o">==</span> <span class="n">zmq</span><span class="o">.</span><span class="n">EAGAIN</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">e</span>
<span class="n">print_colored_debug</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;RemoteMpiCommSessionServer sent results to client</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">,</span> <span class="s2">&quot;green&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">results</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
@ -1152,9 +1149,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -2664,9 +2664,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -807,9 +807,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -741,9 +741,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -809,9 +809,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -812,9 +812,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -856,9 +856,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -952,9 +952,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1255,9 +1255,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -927,9 +927,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1483,9 +1483,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1051,9 +1051,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1898,9 +1898,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1166,9 +1166,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -5454,9 +5454,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1113,9 +1113,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1625,9 +1625,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1453,13 +1453,17 @@
<span class="n">output_ids</span> <span class="o">=</span> <span class="p">[[[]</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_sequences</span><span class="p">)]</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">request_ids</span><span class="p">))]</span>
<span class="n">multi_responses</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">await_responses</span><span class="p">(</span><span class="n">request_ids</span><span class="p">)</span>
<span class="n">responses</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">response</span> <span class="k">for</span> <span class="n">responses</span> <span class="ow">in</span> <span class="n">multi_responses</span> <span class="k">for</span> <span class="n">response</span> <span class="ow">in</span> <span class="n">responses</span>
<span class="p">]</span>
<span class="n">all_responses</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">finished_request_ids</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">while</span> <span class="n">finished_request_ids</span> <span class="o">!=</span> <span class="nb">set</span><span class="p">(</span><span class="n">request_ids</span><span class="p">):</span>
<span class="n">responses</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">await_responses</span><span class="p">()</span>
<span class="k">for</span> <span class="n">response</span> <span class="ow">in</span> <span class="n">responses</span><span class="p">:</span>
<span class="k">if</span> <span class="n">response</span><span class="o">.</span><span class="n">result</span><span class="o">.</span><span class="n">is_final</span><span class="p">:</span>
<span class="n">finished_request_ids</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">response</span><span class="o">.</span><span class="n">request_id</span><span class="p">)</span>
<span class="n">all_responses</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">responses</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_fill_output</span><span class="p">(</span>
<span class="n">responses</span><span class="o">=</span><span class="n">responses</span><span class="p">,</span>
<span class="n">responses</span><span class="o">=</span><span class="n">all_responses</span><span class="p">,</span>
<span class="n">output_ids</span><span class="o">=</span><span class="n">output_ids</span><span class="p">,</span>
<span class="n">end_id</span><span class="o">=</span><span class="n">end_id</span><span class="p">,</span>
<span class="n">return_dict</span><span class="o">=</span><span class="n">return_dict</span><span class="p">,</span>
@ -1831,9 +1835,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -3408,9 +3408,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -973,9 +973,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -1099,9 +1099,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

View File

@ -18,7 +18,10 @@ In this blog, we share the configurations and procedures about how to reproduce
- [Reproducing steps](#reproducing-steps)
- [B200 min-latency](#b200-min-latency)
- [Expected Results](#expected-results)
- [B200 max-throughput](#b200-max-throughput)
- [B200 max-throughput with FP8 KV](#b200-max-throughput-for-r1-0528-with-fp8-kv-cache)
- [Benchmark](#benchmark)
- [Expected Result Format](#expected-result-format)
- [B200 max-throughput with FP16 KV](#b200-max-throughput-for-r1-with-fp16-kv-cache)
- [Benchmark](#benchmark)
- [Expected Result Format](#expected-result-format)
- [H200 min-latency](#h200-min-latency)
@ -181,9 +184,68 @@ Total Token Throughput (tokens/sec): 414.0461
Total Latency (ms): 74561.7520
Average request latency (ms): 7456.1219
```
### B200 max-throughput for R1-0528 with FP8 KV cache
### B200 max-throughput
Our benchmark results are based on **Batch = 3072, ISL = 1K, OSL = 2K, num_requests = 49152 from synthetic dataset**
Our evaluation found that FP8 KV cache does not introduce an obvious accuracy drop compared to BF16 KV cache (see [Precision strategy](./tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md#precision-strategy)), so the latest [DeepSeek-R1-0528-FP4](https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4) checkpoint enables FP8 KV cache by default.
We see a meaningful speedup from the FP8 KV cache, so we are refreshing the numbers here. The results were reproduced with TensorRT-LLM commit b6261862419c33d6ce2313aff1e7116067d6037d.
!! Note that the exact command to reproduce the numbers can change as the API/options are refactored; the options and numbers here are a reference at the given commit.
#### Benchmark
```bash
cat >./extra-llm-api-config.yml <<EOF
pytorch_backend_config:
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 896
- 512
- 256
- 128
- 64
- 32
- 16
- 8
- 4
- 2
- 1
print_iter_log: true
kv_cache_dtype: fp8
enable_attention_dp: true
EOF
trtllm-bench --model nvidia/DeepSeek-R1-0528-FP4 \
    throughput \
    --dataset ${YOUR_DATA_PATH} \
    --backend pytorch \
    --tp 8 --ep 8 \
    --extra_llm_api_options ./extra-llm-api-config.yml \
    --max_batch_size 896 \
    --max_num_tokens 2048 \
    --kv_cache_free_gpu_mem_fraction 0.93 \
    --concurrency 7168 \
    --num_requests 114688
```
#### Expected Result Format
```
===========================================================
= PERFORMANCE OVERVIEW
===========================================================
Request Throughput (req/sec): 21.0675
Total Output Throughput (tokens/sec): 43146.2042
Total Token Throughput (tokens/sec): 65100.6376
Total Latency (ms): 5443839.8140
Average request latency (ms): 332826.9898
Per User Output Throughput [w/ ctx] (tps/user): 6.1806
Per GPU Output Throughput (tps/gpu): 5393.2755
```
### B200 max-throughput for R1 with FP16 KV cache
Our benchmark results are based on **Batch = 3072, ISL = 1K, OSL = 2K, num_requests = 49152 from synthetic dataset**.
The results are reproduced with TensorRT-LLM commit b6261862419c33d6ce2313aff1e7116067d6037d.
!! Note that the exact command to reproduce the numbers can change as the API/options are refactored; the options and numbers here are a reference at the given commit.
#### Benchmark
To do the benchmark, run the following command:
@ -201,20 +263,21 @@ python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
YOUR_DATA_PATH=./dataset.txt
cat >./extra-llm-api-config.yml <<EOF
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 384
print_iter_log: true
pytorch_backend_config:
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 384
print_iter_log: ${PRINT_ITER_LOG}
enable_attention_dp: true
EOF
@ -239,12 +302,13 @@ The perf might be different from different datasets and machines
===========================================================
= PERFORMANCE OVERVIEW
===========================================================
Request Throughput (req/sec): 17.3885
Total Output Throughput (tokens/sec): 35611.5942
Per User Output Throughput (tokens/sec/user): 11.6701
Per GPU Output Throughput (tokens/sec/gpu): 4451.4493
Total Latency (ms): 2826700.0758
Average request latency (ms): 176064.1921
Request Throughput (req/sec): 17.7657
Total Output Throughput (tokens/sec): 36384.0838
Total Token Throughput (tokens/sec): 54576.1257
Total Latency (ms): 2766684.9197
Average request latency (ms): 172321.7206
Per User Output Throughput [w/ ctx] (tps/user): 11.9263
Per GPU Output Throughput (tps/gpu): 4548.0105
```
### H200 min-latency

View File

@ -55,7 +55,7 @@ For the draft stage in MTP, there are two different MTP methods, MTP vanilla and
MTP Vanilla method is more similar to the MTP training, and it sequentially uses different MTP modules to predict multiple draft tokens. This method can support model checkpoints with weights of multiple different MTP modules. And each MTP module will have its own KV cache.
Figure 2 illustrates the MTP vanilla inference. In the context phase, assuming there are a total of four input tokens, we will get the output token $t_5$ and the hidden states after the main model forward. The output token will be appended to the input tokens, then we shift out the first token to get tokens from $t_2$ to $t_5$ as the input tokens of the first MTP module. The hidden states from the main model will be directly used as the input of the first MTP module to predict the first draft token. For the next several MTP modules, we will use the same method to prepare the inputs to predict the sequential draft tokens.
Figure 2 illustrates the MTP vanilla inference. In the context phase, assuming there are a total of four input tokens, we will get the output token $t_5$ and the hidden states after the main model forward. The output token will be appended to the input tokens, then we shift out the first token to get tokens from $t_2$ to $t_5$ as the input tokens of the first MTP module. The hidden states from the main model will be directly used as the input of the first MTP module to predict the first draft token. For the next few MTP modules, we'll append the newly generated draft token and the hidden states corresponding to the last input token to the input tokens and hidden states. Then we'll shift out the first token to prepare the inputs for the next MTP module. In this way, we can retain as much information as possible from the main model, which helps the draft layer make more accurate predictions.
In the generation phase, there will be a little difference. The predicted token $t_5$ and the draft tokens will be used as inputs for the main model. After the main model forward, we will do the verification to get the accepted tokens. In this example, assuming $j$ draft tokens $d_6$~$d_{j+5}$ are accepted. Then prepare the MTP module inputs. Different from the context phase, we will prepare input IDs and hidden states of a total of $K$ tokens before the last accepted token. In this example, the last accepted token is $t_{j+6}$. Then we can get the first draft token after the first MTP module forward. For the sequential MTP modules, we can prepare their inputs in a similar way to the MTP modules in the context phase, so all of those MTP modules have the same input sequence length. After predicting all of the draft tokens, we need to evict the keys/values of those rejected draft tokens from the main model's KV cache to ensure the subsequent calculation is correct.
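To make the verification step concrete, here is a minimal, hypothetical sketch of the greedy acceptance logic described above. It is not the TensorRT-LLM implementation; the function and variable names are made up, and the real runtime additionally evicts the rejected tokens' KV cache entries as noted in the text.

```python
def verify_draft_tokens(draft_tokens, target_tokens):
    """Greedy verification sketch (hypothetical names, illustration only).

    draft_tokens:  K draft tokens proposed by the MTP module(s).
    target_tokens: K + 1 tokens sampled from the main model at the
                   corresponding positions during its forward pass.
    """
    accepted = []
    for i, draft in enumerate(draft_tokens):
        if draft == target_tokens[i]:
            accepted.append(draft)   # draft token agrees with the main model
        else:
            break                    # stop at the first mismatch
    # The main model always contributes one more token after the accepted prefix.
    bonus_token = target_tokens[len(accepted)]
    return accepted, bonus_token

# Example with K = 3: accepts [7, 9], rejects 4, and takes 2 as the next token.
print(verify_draft_tokens([7, 9, 4], [7, 9, 2, 5]))
```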
@ -72,7 +72,7 @@ MTP Eagle can be viewed as a variant of [Eagle](https://arxiv.org/pdf/2401.15077
Figure 3 gives an MTP Eagle example. In the context phase, the inputs of the first MTP module forward are the same as the MTP Vanilla. However, for the sequential MTP module forward, the first difference is that MTP Eagle uses the same MTP module to predict draft tokens and reuses the same KV cache. Another difference is that we only need to input the token ID and the hidden state of one token. The token is the last predicted draft token, while the hidden state is the corresponding hidden state in the last MTP module forward. In this way, we can predict total K draft tokens by using only one MTP module.
In the generation phase, the verification stage is the same as MTP Vanilla. After getting the accepted tokens, we will use the last accepted tokens and the corresponding hidden state as the inputs of the first MTP module forward. Compared with MTP Vanilla, it will be much easier to implement. And the sequential MTP module forwards use the same method as the context phase to prepare inputs. After predicting all of the draft tokens, we need to evict the keys/values of those rejected draft tokens from the main model's KV cache.
In the generation phase, the verification stage is the same as MTP Vanilla. Once we get the accepted tokens, we use all of them along with their corresponding hidden states as inputs for the first MTP module forward. Unlike MTP Vanilla, which needs to store past tokens and hidden states, this approach is much easier to implement. Subsequent MTP module forwards follow the same input preparation method as the context phase. After predicting all draft tokens, we need to evict the key/value pairs of any rejected draft tokens from the main model's KV cache.
## MTP implementation in TensorRT-LLM
### Basic Implementation
@ -241,14 +241,6 @@ TensorRT-LLM PyTorch backend can only support chain-based speculative decoding n
Another important method is Eagle3. From the [Eagle3 paper](https://arxiv.org/pdf/2503.01840), the promising results show that it can help greatly increase the acceptance rate by leveraging hidden states from different levels to predict draft tokens. Since TensorRT-LLM already has [Eagle-3 support](https://github.com/NVIDIA/TensorRT-LLM/pull/3035) now, in the future, we also want to train an Eagle3 head to support DeepSeek-V3/R1+Eagle3 to achieve better speedup.
### Fix known issues
There are still some known issues, and we will fix them soon:
- The MTP vanilla path has a known accuracy issue. We will fix it and refactor the MTP vanilla implementation.
- The MTP Eagle is non-deterministic now.
- An accuracy issue when enabling MTP and attention DP together.
## Acknowledgment
This was a remarkable cross-team effort to support and optimize MTP in TensorRT-LLM. We would like to extend our gratitude to everyone who contributed to making this possible, as it involved a typical system/algorithm co-design approach spanning multiple technical layers—including kernel optimization, runtime enhancements, algorithmic improvements, and performance measurement & analysis. And a special thanks goes to the DeepSeek team for developing the MTP method, which lays down the foundation of this blog.

View File

@ -0,0 +1,278 @@
# Disaggregated Serving in TensorRT-LLM
By NVIDIA TensorRT-LLM Team
- [Disaggregated Serving in TensorRT-LLM](#Disaggregated-Serving-in-TensorRT-LLM)
- [Motivation](#Motivation)
- [Disaggregated Serving in TensorRT-LLM](#Disaggregated-Serving-in-TensorRT-LLM)
- [trtllm-serve](#trtllm-serve)
- [Dynamo](#Dynamo)
- [Triton Inference Server](#Triton-Inference-Server)
- [KV Cache Exchange](#KV-Cache-Exchange)
- [Multi-backend Support](#Multi-backend-Support)
- [Overlap Optimization](#Overlap-Optimization)
- [Cache Layout Transformation](#Cache-Layout-Transformation)
- [Performance Studies](#Performance-Studies)
- [Measurement Methodology](#Measurement-Methodology)
- [DeepSeek R1](#DeepSeek-R1)
- [ISL 4400 - OSL 1200 (Machine Translation Dataset)](#ISL-4400---OSL-1200-Machine-Translation-Dataset)
- [ISL 8192 - OSL 256 (Synthetic Dataset)](#ISL-8192---OSL-256-Synthetic-Dataset)
- [ISL 4096 - OSL 1024 (Machine Translation Dataset)](#ISL-4096---OSL-1024-Machine-Translation-Dataset)
- [Reproducing Steps](#Reproducing-Steps)
- [Future Work](#Future-Work)
- [Acknowledgement](#Acknowledgement)
In past tech blogs, we introduced optimizations targeting [low-latency](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) and [throughput](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md)-oriented scenarios. For production deployment, users also care about per-GPU throughput under certain latency constraints. In this tech blog, we introduce the design concept and usage of TensorRT-LLM disaggregated serving, which directly targets throughput@latency performance scenarios, together with performance study results.
## Motivation
LLM inference has two phases: the context (prefill) phase and the generation (decode) phase. The context phase computes the KV cache for the prompt tokens, whereas the generation phase generates tokens one by one using the cached values. These phases have different compute characteristics.
There are two ways of serving LLM inference requests:
* Aggregated LLM serving (sometimes it is also called IFB in this tech blog), in which the context and generation phases are run on the same GPU.
* Disaggregated LLM serving, in which the context and generation phases are run on different GPUs.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture1.png" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 1. The execution timeline of aggregated LLM serving</em></sub></p>
In aggregated LLM serving, both the context and generation phases share the same GPU resources and parallelism strategy. This can lead to interference where context processing delays token generation, increasing token-to-token latency (TPOT) and reducing interactivity. This is illustrated in Figure 1, which shows the execution timeline for aggregated LLM serving. Aggregated LLM serving also forces a single GPU type and parallelism configuration for both phases, even though their compute needs differ. As a result, optimizing for one metric, such as time-to-first-token (TTFT), often comes at the expense of another, such as TPOT.
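To keep the two metrics precise, the following sketch computes TTFT and average TPOT from per-token arrival timestamps. The function name and the numbers are hypothetical; it only illustrates the definitions used throughout this blog.

```python
def latency_metrics(request_start, token_arrival_times):
    """Return (TTFT, average TPOT) for a single request.

    request_start:       time the request was submitted, in seconds.
    token_arrival_times: arrival time of each generated token, in seconds.
    """
    ttft = token_arrival_times[0] - request_start  # time-to-first-token
    gaps = [b - a for a, b in zip(token_arrival_times, token_arrival_times[1:])]
    tpot = sum(gaps) / len(gaps) if gaps else 0.0  # mean token-to-token latency
    return ttft, tpot

# First token after 0.8 s, then a steady 25 ms per token.
print(latency_metrics(0.0, [0.800, 0.825, 0.850, 0.875]))  # roughly (0.8, 0.025)
```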
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture2.png" width="580" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 2. The execution timeline of disaggregated LLM serving</em></sub></p>
Disaggregated serving resolves these challenges by decoupling the two phases, allowing each to run on separate GPU pools and using different parallelism strategies. This separation removes the interference between context and generation phases, as shown in Figure 2, and enables independent optimization of TTFT and TPOT. Although disaggregation incurs overhead for transferring the KV cache blocks from context to generation GPUs, the advantages can be substantial—particularly for workloads with long input sequences and moderate output lengths where interference is most severe.
You can also refer to [this paper](https://arxiv.org/pdf/2506.05508) for more details about the rationale and design considerations of disaggregated serving.
## Disaggregated Serving in TensorRT-LLM
There are three different approaches to disaggregated LLM inference with TensorRT-LLM, each offering distinct architectural and operational characteristics suited to different deployment scenarios.
### trtllm-serve
[`trtllm-serve`](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html) is a command-line utility that facilitates the deployment of an OpenAI-compatible server for TensorRT-LLM instances.
The first approach to do disaggregated LLM inference with TensorRT-LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 3 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 3). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture3.png" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 3. `trtllm-serve` integration with disaggregated service</em></sub></p>
In the example below, two context servers are launched on ports 8001 and 8002, and two generation servers are launched on ports 8003 and 8004. Finally, a disaggregated server is also launched using `trtllm-serve`. The disaggregated server will receive client requests on port 8000, and do the orchestration between the context and generation servers.
```shell
# Launching context servers
trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_ctx0 &
trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_ctx1 &
# Launching generation servers
trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_gen0 &
trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8004 --kv_cache_free_gpu_memory_fraction 0.15 --backend pytorch &> output_gen1 &
# Launching disaggregated server
trtllm-serve disaggregated -c disagg_config.yaml
```
```yaml
# disagg_config.yaml
hostname: localhost
port: 8000
context_servers:
num_instances: 2
router:
type: round_robin
urls:
- "localhost:8001"
- "localhost:8002"
generation_servers:
num_instances: 2
urls:
- "localhost:8003"
- "localhost:8004"
```
The disaggregated server supports various load balancing strategies, including round-robin and KV cache-aware routing. Although it currently supports a fixed number of context and generation instances, the architecture is designed to be extensible, and efforts are underway to enable dynamic scaling.
For more information on this approach to do disaggregated serving, please refer to [the example](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/disaggregated#trt-llm-disaggregated-serving).
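Once the disaggregated server above is running on port 8000, clients can treat it like any other OpenAI-compatible endpoint; the orchestration between context and generation servers is transparent to them. The snippet below is a minimal sketch that assumes the standard `/v1/completions` route and the TinyLlama model name from the example; adjust both for your deployment.

```python
import requests

payload = {
    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "prompt": "Disaggregated serving separates prefill and decode because",
    "max_tokens": 32,
}
# The disaggregated server forwards the prompt to a context server, then hands the
# request (and the returned ctx_params) to a generation server to produce the output.
response = requests.post("http://localhost:8000/v1/completions", json=payload)
print(response.json())
```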
### Dynamo
The second approach involves the use of [Dynamo](https://github.com/ai-dynamo/dynamo), a data center-scale inference server developed specifically for LLM workloads. Dynamo introduces several advanced features not present in the other methods, including decoupled pre- and post-processing workers, which are particularly beneficial under high concurrency conditions. The disaggregated LLM inference workflow with Dynamo is illustrated in Figure 4.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture4.png" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 4. Dynamo integration with disaggregated service</em></sub></p>
In the Dynamo workflow, requests are initially processed by pre- and post-processing workers, which then query a smart router to determine the optimal decode worker to route the requests to. Depending on the availability of KV cache blocks, the decoder worker may bypass the prefill stage or forward the request to the prefill worker. Once the prefill worker is done processing the prompt, the KV cache blocks can be sent from the prefill worker to the decoder worker, using the metadata referred to as ctx_params in the figure above.
Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments.
For more information on how to use Dynamo with TensorRT-LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html).
### Triton Inference Server
The third approach to do disaggregated LLM inference with TensorRT-LLM utilizes the Triton Inference Server. With this approach a Triton ensemble model is employed, comprising a preprocessor, an orchestrator implemented as [a Python business logic scripting (BLS) backend](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/bls.html), and a post-processor. The orchestrator is responsible for routing client requests to context and generation instances, managing the flow of prompt tokens, and handling the return of generated tokens. This approach is illustrated in Figure 5. The Triton Inference Server approach relies on the Triton TensorRT-LLM backend and the Executor API, which is supported only for the TensorRT backend. For more information on how to use this approach, please refer to [this documentation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/triton_backend/all_models/disaggregated_serving#running-disaggregated-serving-with-triton-tensorrt-llm-backend).
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture5.png" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 5. Triton integration with disaggregated service</em></sub></p>
## KV Cache Exchange
### Multi-backend Support
In TensorRT-LLM, the KV cache exchange is modularly decoupled from the KV cache manager and the underlying communication libraries, as shown in Figure 6. The KV cache exchange module is responsible for efficient transmission and reception of the cache, promptly releasing cache space, and performing cache layout conversions during the exchange process. Currently, mainstream communication protocols—MPI, UCX, and NIXL—are all supported by TensorRT-LLM, and the underlying communication protocols utilize RDMA / NVLink. Currently, we recommend using UCX and NIXL backends, as we are adding a dynamic scaling mechanism on top of them—specifically, dynamic node joining and leaving. This allows customers to adjust the load based on traffic demands or switch roles between context and generation dynamically.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture6.png" width="890" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 6. KV cache exchange architecture</em></sub></p>
### Overlap Optimization
To optimize the overall performance of disaggregated serving, TensorRT-LLM overlaps the KV cache transmission with computation for multiple independent requests. While one request is sending or receiving its KV cache blocks, other requests can proceed with computation, as illustrated in Figure 7. Furthermore, if context and generation instances are using multiple GPUs per instance, KV cache transmission between different sets of GPUs can occur in parallel.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture7.png" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 7. KV cache exchange timing diagram</em></sub></p>
### Cache Layout Transformation
To minimize KV cache transmission latency, TensorRT-LLM currently uses direct transmission between device memories for cache transfer. The KV cache transmission supports using different parallel strategies for the context and generation phases. In such cases, careful orchestration of KV cache block mapping is required. Figure 8 illustrates this using the example of context phase with TP2 and generation phase with PP2.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture8.png" width="680" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 8. KV cache layout conversion</em></sub></p>
The optimizations required for KV cache transmission vary depending on whether it's single-node multi-GPU, multi-node multi-GPU, or different GPU models. To accommodate this, TensorRT-LLM provides a set of environment variables for selection in different environments. Please refer to [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md) for details.
## Performance Studies
### Measurement Methodology
Generating a performance curve for disaggregated LLM serving requires an exhaustive sweep across all parallelization strategies. This includes combinations of TP/EP/DP/PP and other optimizations like speculative decoding (such as MTP). These combinations must be evaluated separately for context and generation stages. As the number of context (CTX) and generation (GEN) servers increases, the number of possible configurations grows exponentially.
To identify optimal configurations, a two-step process is used:
* Rate Matching
* Measure request throughput (request/s/GPU) for context servers for the different TP/EP/DP/PP mappings that meet the TTFT constraint, and choose the most efficient configuration.
* Measure total throughput (tok/s) and latency (tok/s/user) for generation servers across different TP/EP/DP/PP mappings and concurrency levels, with speculative decoding turned on and off.
* Find the ratio of context to generation workers such that the aggregated throughput of the context servers matches the aggregated throughput of the generation servers for the workload's input sequence length (ISL) and output sequence length (OSL).
* Calculate the throughput per GPU using the following formula (a numeric plug-in example appears after this list):
$\frac{\text{Total Output Tokens/sec}}{\left(\frac{\text{NumCtxGPUs} \times \text{GenReqRate}}{\text{CtxReqRate}}\right) + \text{NumGenGPUs}}$
* Once the ideal ratio of context to generation servers is computed, the “rate-matched” Pareto curve can be constructed to identify the best configuration to use at different latencies (tok/s/user)
* E2E measurement
* Benchmark `trtllm-serve` disaggregated setups for the most promising configurations, taking into account practical limits on the total number of GPUs available.
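As a quick numeric illustration of the rate-matching formula above, the sketch below simply plugs hypothetical values into it; all numbers are made up and only show the arithmetic.

```python
# Hypothetical inputs for the formula above (illustration only).
total_output_tok_s = 8000.0  # Total Output Tokens/sec of the generation configuration
num_ctx_gpus = 4             # NumCtxGPUs
ctx_req_rate = 8.0           # CtxReqRate
num_gen_gpus = 16            # NumGenGPUs
gen_req_rate = 10.0          # GenReqRate

# Throughput/GPU = TotalOutputTokens/sec / (NumCtxGPUs * GenReqRate / CtxReqRate + NumGenGPUs)
per_gpu_throughput = total_output_tok_s / (num_ctx_gpus * gen_req_rate / ctx_req_rate + num_gen_gpus)
print(round(per_gpu_throughput, 1))  # 8000 / (5 + 16), roughly 381 tokens/s/GPU
```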
### DeepSeek R1
We conducted performance testing on DeepSeek R1 based on datasets with different ISLs and OSLs. All experiments below were conducted on GB200 GPUs.
#### ISL 4400 - OSL 1200 (Machine Translation Dataset)
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture9.png" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 9. “Rate-matched” Pareto curve for DeepSeek R1 without MTP</em></sub></p>
Figure 9 shows the rate-matched Pareto curve for DeepSeek R1 with MTP off. Configurations with attention DP and attention TP were considered, with 4, 8, 16 or 32 GPUs per instance. The speedups obtained with disaggregation range from **1.4x** to **1.8x**, especially at lower concurrency levels.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture10.png" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 10. DeepSeek R1 with MTP Pareto curve</em></sub></p>
For some data points on the performance curve, the context/generation instance number is shown with the corresponding parallelism mapping employed for each instance. For example, `CTX=1xTEP-4|GEN=2xDEP-8` means 1 TEP4 context instance and 2 DEP8 generation instances constitute a full LLM serving instance.
As shown in Figure 10, enabling MTP further increases the speedups of disaggregation over aggregation, reaching 1.6x to 2.5x, on average 20-30% higher than with MTP off.
#### ISL 8192 - OSL 256 (Synthetic Dataset)
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture11.png" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 11. DeepSeek R1 4-GPU Pareto curve. ctx/gen=4.5 means SOL rate matching between context and generation phase, which is only used for SOL perf result collection purpose. c4dep4_g1dep4 means 4 DEP4 context instances plus 1 DEP4 generation instance form a full LLM serving instance.</em></sub></p>
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture12.png" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 12. DeepSeek R1 8-GPU Pareto curve</em></sub></p>
Figures 11 and 12 show the performance curves for the ISL8192-OSL256 dataset on DeepSeek R1 using 4 GPUs per generation instance (GEN4) and 8 GPUs per generation instance (GEN8) respectively. With disaggregation, we plot both “rate-matched” results (based on perfect rate matching between context and generation phases) and E2E results (which can be directly reproduced by users in production deployment environments).
The results show that for this ISL/OSL setting, disaggregated serving outperforms aggregated serving significantly—achieving up to **1.73x** speedup with GEN4 and up to **2x** with GEN8.
By comparing the disaggregated serving E2E results with the “rate-matched” curve, we observe a performance gap of 0-25%. This discrepancy is expected, as SOL performance relies on idealized assumptions—such as fractional ctx:gen ratios and the absence of KV cache transfer overhead.
#### ISL 4096 - OSL 1024 (Machine Translation Dataset)
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture13.png" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 13. DeepSeek R1 E2E Pareto curves with MTP = 1, 2, 3. In this figure, ctx1dep4-gen2dep4-mtp3 means 1 DEP4 context instance plus 2 DEP4 generation instances with MTP = 3.</em></sub></p>
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture14.png" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 14. DeepSeek R1 E2E Pareto curves without MTP.</em></sub></p>
Figures 13 and 14 show the E2E Pareto curves for aggregated and disaggregated serving, with and without MTP.
For the Pareto curves with MTP = 1, 2, 3, disaggregated serving shows a **1.7x** improvement over aggregated serving at 50 tokens/sec/user (roughly 20 ms per output token). Enabling MTP provides a larger speedup at higher concurrencies.
### Reproducing Steps
We provide a set of scripts to reproduce the performance data presented in this paper. Please refer to the usage instructions described in [this document](https://github.com/NVIDIA/TensorRT-LLM/tree/main/docs/source/scripts/disaggregated).
## Future Work
Although we can already demonstrate the performance benefits of disaggregated LLM inference with TensorRT-LLM, there is still work to be done to further improve performance and ease of use. Among other things, we plan to:
* Provide detailed steps and scripts to automate the generation of throughput-latency performance curves comparing aggregated and disaggregated serving.
* Continue to improve performance at larger scales (large-scale EP for example).
* Support dynamic scaling of context and generation instances based on traffic load.
* Support overlapping KV cache communication and compute on a per-layer basis.
## Acknowledgement
Adding support for disaggregated serving in TensorRT-LLM has been a true one-team effort, requiring close collaboration that spans kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state of the art in TensorRT-LLM performance. Through this collaborative endeavor, we have gained valuable insights into improving GPU utilization for large language model inference. We hope that the techniques and experience shared in this blog will help the developer community better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications.

View File

@ -2,7 +2,7 @@
# Building from Source Code on Linux
This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0, which uses the new CXX11 ABI.
This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0 and later versions, which use the new CXX11 ABI.
## Prerequisites

View File

@ -5,7 +5,7 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
```bash
pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
```

View File

@ -5,7 +5,7 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
```bash
(Optional) pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
(Optional) pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
```

View File

@ -4,155 +4,192 @@ API Reference
.. autoclass:: tensorrt_llm.llmapi.LLM
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:inherited-members:
.. autoclass:: tensorrt_llm.llmapi.CompletionOutput
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.RequestOutput
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.GuidedDecodingParams
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.SamplingParams
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.DisaggregatedParams
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.KvCacheConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.KvCacheRetentionConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.LookaheadDecodingConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.MedusaDecodingConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.EagleDecodingConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.MTPDecodingConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.SchedulerConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.CapacitySchedulerPolicy
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.BuildConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.QuantConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.QuantAlgo
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.CalibConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.BuildCacheConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.RequestError
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.MpiCommSession
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.BatchingType
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.ContextChunkingPolicy
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.DynamicBatchConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.CacheTransceiverConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.NGramDecodingConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.TorchCompileConfig
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.DraftTargetDecodingConfig
:members:
:undoc-members:
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.LlmArgs
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.TorchLlmArgs
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.TrtLlmArgs
:members:
:undoc-members:
:special-members: __init__
:show-inheritance:
:special-members: __init__

View File

@ -43,8 +43,8 @@ Unit tests live under `tests/unittest/` and run during the merge-request pipelin
`jenkins/L0_Test.groovy` maps stage names to these YAML files. For A100 the mapping includes:
```groovy
"A100X-Triton-Python-[Post-Merge]-1": ["a100x", "l0_a100", 1, 2],
"A100X-Triton-Python-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
"A100X-Triton-[Post-Merge]-1": ["a100x", "l0_a100", 1, 2],
"A100X-Triton-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
```
The array elements are: GPU type, YAML file (without extension), shard index, and total number of shards. Only tests with `stage: post_merge` from that YAML file are selected when a `Post-Merge` stage runs.
@ -57,12 +57,12 @@ The array elements are: GPU type, YAML file (without extension), shard index, an
### Example
`triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]` appears in `l0_a100.yml` under `stage: post_merge` and `backend: triton`. The corresponding Jenkins stages are `A100X-Triton-Python-[Post-Merge]-1` and `A100X-Triton-Python-[Post-Merge]-2` (two shards).
`triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]` appears in `l0_a100.yml` under `stage: post_merge` and `backend: triton`. The corresponding Jenkins stages are `A100X-Triton-[Post-Merge]-1` and `A100X-Triton-[Post-Merge]-2` (two shards).
To run the same tests on your pull request, comment:
```bash
/bot run --stage-list "A100X-Triton-Python-[Post-Merge]-1,A100X-Triton-Python-[Post-Merge]-2"
/bot run --stage-list "A100X-Triton-[Post-Merge]-1,A100X-Triton-[Post-Merge]-2"
```
This executes the same tests that run post-merge for this hardware/backend.

View File

@ -142,9 +142,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
- Software Compatibility
* - Container
- [25.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
- [25.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
- [10.10](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
- [10.11](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
-
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

View File

@ -0,0 +1,93 @@
# Disaggregated Inference Benchmark Scripts
This directory contains scripts to run disaggregated inference benchmarks using TensorRT-LLM and SLURM.
## Overview
The benchmarking process is orchestrated through a set of shell scripts and a Python script that work together:
1. `submit.sh`: The main entry point for submitting benchmark jobs to SLURM. It runs a parameter sweep by calling `sbatch` with different configurations.
2. `disaggr_torch.slurm`: The SLURM script that sets up and runs a single benchmark experiment. It launches a container, generates a configuration file, starts the server and workers, and runs the benchmark client.
3. `gen_yaml.py`: A Python script that generates the `config.yaml` file needed by `trtllm-serve`. It determines the server and worker configuration based on SLURM environment variables and script arguments.
4. `start_worker.sh`: A shell script responsible for starting a `trtllm-serve disaggregated_mpi_worker` on each allocated machine.
5. `run_benchmark.sh`: A shell script that waits for the server to be healthy and then runs the actual benchmark client (`run_benchmark.py`, not included in this directory).
## File Descriptions
### `submit.sh`
This script is used to submit multiple SLURM jobs for running benchmarks with different parameters. It iterates through various configurations and uses `sbatch` to submit `disaggr_torch.slurm` for each one.
**Usage:**
```bash
./submit.sh
```
You can modify the loops in this script to change the parameter space for the benchmark sweep.
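A sketch of what such a sweep can look like is shown below; the values are illustrative rather than tuned settings, and the 13 positional arguments follow the `disaggr_torch.slurm` interface documented in the next section.

```bash
# Illustrative sweep over generation TP size and batch size (not a tuned configuration).
for gen_tp in 4 8; do
  for gen_bs in 64 256; do
    sbatch disaggr_torch.slurm \
      1 4 4 4480 true \
      1 "$gen_tp" "$gen_bs" "$gen_bs" true \
      0.8 "1 8 64 256" "sweep_tp${gen_tp}_bs${gen_bs}"
  done
done
```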
### `disaggr_torch.slurm`
This is the core SLURM script for a single benchmark run. It is not meant to be run directly, but rather submitted via `sbatch` (e.g., by `submit.sh`).
It takes the following arguments, in order (an annotated example invocation follows the list):
1. `num_ctx_servers`: Number of context servers.
2. `ctx_tp_size`: Tensor parallel size for context servers.
3. `ctx_batch_size`: Max batch size for context servers.
4. `ctx_max_num_tokens`: Max number of tokens for context servers.
5. `ctx_enable_attention_dp`: `true` or `false` to enable attention DP for context servers.
6. `num_gen_servers`: Number of generation servers.
7. `gen_tp_size`: Tensor parallel size for generation servers.
8. `gen_batch_size`: Max batch size for generation servers.
9. `gen_max_num_tokens`: Max number of tokens for generation servers.
10. `gen_enable_attention_dp`: `true` or `false` to enable attention DP for generation servers.
11. `gen_gpu_memory_fraction`: GPU memory fraction for generation servers.
12. `concurrency_list`: A space-separated list of concurrencies to test (e.g., "1 2 4 8").
13. `sub_file`: A subdirectory name for logs.
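For readability, the hedged example below passes the same 13 arguments through named shell variables; all values are placeholders, not a recommended configuration.

```bash
# Illustrative single submission; the numbering matches the argument list above.
num_ctx_servers=1              # 1
ctx_tp_size=4                  # 2
ctx_batch_size=4               # 3
ctx_max_num_tokens=4480        # 4
ctx_enable_attention_dp=true   # 5
num_gen_servers=1              # 6
gen_tp_size=8                  # 7
gen_batch_size=256             # 8
gen_max_num_tokens=256         # 9
gen_enable_attention_dp=true   # 10
gen_gpu_memory_fraction=0.8    # 11
concurrency_list="1 8 64 256"  # 12
sub_file="r1_gen_tp8"          # 13 (log subdirectory)

sbatch disaggr_torch.slurm \
  "$num_ctx_servers" "$ctx_tp_size" "$ctx_batch_size" "$ctx_max_num_tokens" "$ctx_enable_attention_dp" \
  "$num_gen_servers" "$gen_tp_size" "$gen_batch_size" "$gen_max_num_tokens" "$gen_enable_attention_dp" \
  "$gen_gpu_memory_fraction" "$concurrency_list" "$sub_file"
```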
### `gen_yaml.py`
This Python script generates the `config.yaml` file that configures the `trtllm-serve` application. It reads SLURM environment variables (`SLURM_JOB_NODELIST`, `SLURM_TASKS_PER_NODE`) to distribute workers across nodes.
**Usage:**
The script is called from within `disaggr_torch.slurm`. It takes numerous arguments to define the model, parallelism, and server configurations.
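For reference, the snippet below shows the kind of SLURM context the script relies on; it only illustrates how the compact nodelist can be inspected and expanded, and is not part of `gen_yaml.py` itself.

```bash
# Illustrative only: inspect the SLURM variables gen_yaml.py uses to place workers.
echo "nodes: $SLURM_JOB_NODELIST, tasks per node: $SLURM_TASKS_PER_NODE"
scontrol show hostnames "$SLURM_JOB_NODELIST"   # expand the compact nodelist, one hostname per line
```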
### `start_worker.sh`
This script starts a `trtllm-serve disaggregated_mpi_worker`. It is launched by `srun` from the `disaggr_torch.slurm` script on all allocated nodes.
**Arguments:**
1. `config_file`: Path to the `config.yaml` file.
2. `enable_pdl`: `true` or `false`.
3. `ctx_gpus`: Number of GPUs used for the context phase.
4. `work_dir`: (Optional) Directory to store nsys profiling output.
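An illustrative invocation is shown below; in practice it is launched on every allocated node by `srun` from `disaggr_torch.slurm`, and the paths and values here are placeholders.

```bash
# Illustrative invocation: config file, PDL flag, context GPU count, optional work dir for nsys output.
bash start_worker.sh ./config.yaml true 4 ./logs/nsys
```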
### `run_benchmark.sh`
This script orchestrates the execution of the benchmark client. It waits for the `config.yaml` file to be created and for the server's `/health` endpoint to respond, and then runs the benchmark.
**Arguments:**
1. `isl`: Input sequence length.
2. `osl`: Output sequence length.
3. `multi_round`: Number of rounds for the benchmark.
4. `model_name`: Name of the model being benchmarked.
5. `concurrency_list`: Space-separated list of concurrencies.
6. `streaming`: `true` or `false`.
7. `log_path`: Path to the log directory.
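An illustrative invocation matching the argument order above; the model name, sequence lengths, and log path are placeholders.

```bash
# isl  osl  multi_round  model_name  concurrency_list  streaming  log_path   (all values illustrative)
bash run_benchmark.sh 4400 1200 4 deepseek-r1 "1 8 64 256" true ./logs
```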
## Workflow
1. The user runs `./submit.sh`.
2. `submit.sh` submits one or more jobs to SLURM by calling `sbatch disaggr_torch.slurm` with different parameters.
3. For each job, SLURM allocates resources and runs `disaggr_torch.slurm`.
4. `disaggr_torch.slurm` runs `gen_yaml.py` to create a `config.yaml`.
5. `disaggr_torch.slurm` uses `srun` to launch `start_worker.sh` on all nodes, starting the MPI workers.
6. `disaggr_torch.slurm` starts the main `trtllm-serve` process.
7. `disaggr_torch.slurm` runs `run_benchmark.sh` which waits for the server to be ready.
8. `run_benchmark.sh` executes the benchmark for each concurrency level specified.
9. After the benchmark, `run_benchmark.sh` and `disaggr_torch.slurm` attempt to kill the server and worker processes.
10. Logs for each run are stored in a subdirectory specified by the `sub_file` parameter.

View File

@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You
## Quick Start
Here is a simple example to show how to use `tensorrt_llm._torch.LLM` API with Llama model.
Here is a simple example to show how to use the `tensorrt_llm.LLM` API with a Llama model.
```{literalinclude} ../../examples/pytorch/quickstart.py
:language: python
@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized
which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
llm.generate("Hello, my name is")
```
@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on
In order to use this feature, it is necessary to enable option `enable_trtllm_sampler` in the `LLM` class, and pass a `SamplingParams` object with the desired options as well. The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
enable_trtllm_sampler=True)
sampling_params = SamplingParams(

View File

@ -186,7 +186,7 @@ __all__ = [
Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script:
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
import modeling_mymodel
def main():

View File

@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d
## Top Level API
The interface for PyTorch backend is `tensorrt._torch.LLM`.
The interface for the PyTorch backend is `tensorrt_llm.LLM`.
```python
from tensorrt_llm._torch import LLM
from tensorrt_llm import LLM
llm = LLM(model=<path_to_llama_from_hf>)
```

View File

@ -51,7 +51,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -63,7 +63,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc2" />
<meta name="docsearch:version" content="1.0.0rc0" />
</head>
@ -809,9 +809,9 @@ export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 16, 2025.</p>
<p>Last updated on June 21, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/8445416">8445416</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/ebadc13">ebadc13</a>.</p>
</div></div>

Some files were not shown because too many files have changed in this diff