mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-28 14:44:24 +08:00

Update latest GitHub pages to v1.2.0rc5

This commit is contained in:
parent a071059a8e
commit 0137c0e12a
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: e877fa21f4c01def0efb8f650d34bf16
+config: e432c3509163ef03323e39d8537d99ca
 tags: 645f666f9bcd5a90fca523b33c5a78b7
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -138,6 +138,7 @@ class Attention(nn.Module):
         disable_deep_gemm: bool = False,
         attn_output_gate: Optional[bool] = None,
         use_custom_cublas_mm: bool = False,
+        reduce_output: bool = True,
     ):
         """
         Initialize the Attention module.
@@ -234,6 +235,15 @@ class Attention(nn.Module):
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_key_value_heads * self.head_dim

+        qkv_shard_indices_mapping = {
+            "q": (0, self.q_size * (2 if self.attn_output_gate else 1)),
+            "k":
+            (self.q_size * (2 if self.attn_output_gate else 1), self.kv_size),
+            "v":
+            (self.q_size * (2 if self.attn_output_gate else 1) + self.kv_size,
+             self.kv_size),
+        }
+
         self.qkv_proj = Linear(
             self.hidden_size,
             tp_size * self.q_size * (2 if self.attn_output_gate else 1) +
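As a standalone illustration of the mapping above (sizes invented, not taken from any real model config), each entry records an (offset, size) slice of the fused QKV weight; the factor mirrors the `2 if attn_output_gate else 1` term, since a gated variant stores Q at twice its width:

# Illustration only: (offset, size) slices of a fused QKV weight.
q_size, kv_size, gated = 4096, 1024, False
factor = 2 if gated else 1
mapping = {
    "q": (0, q_size * factor),
    "k": (q_size * factor, kv_size),
    "v": (q_size * factor + kv_size, kv_size),
}
for name, (offset, size) in mapping.items():
    print(f"{name}: rows [{offset}, {offset + size})")  # q: [0, 4096), ...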
@@ -249,7 +259,8 @@ class Attention(nn.Module):
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
             disable_deep_gemm=disable_deep_gemm,
-            use_custom_cublas_mm=use_custom_cublas_mm)
+            use_custom_cublas_mm=use_custom_cublas_mm,
+            fused_weight_shard_indices_mapping=qkv_shard_indices_mapping)

         self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
                                 [self.hidden_size])
@@ -264,6 +275,7 @@ class Attention(nn.Module):
             quant_config=config.get_quant_config(),
             skip_create_weights_in_init=config.skip_create_weights_in_init,
             lora=self.o_lora,
+            reduce_output=reduce_output,
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
             disable_deep_gemm=disable_deep_gemm,
@@ -370,8 +382,11 @@ class Attention(nn.Module):
         out_dtype = q.dtype

         if self.attn_backend == "TRTLLM":
-            if self.has_quant_scale and (self.attn.has_fp8_kv_cache
-                                         or self.attn.has_fp4_kv_cache):
+            # Don't use FP8 output if o_proj has pre_quant_scale - keep BF16 for better precision
+            has_pre_quant_scale = getattr(self.o_proj, 'pre_quant_scale',
+                                          None) is not None
+            if self.has_quant_scale and not has_pre_quant_scale and (
+                    self.attn.has_fp8_kv_cache or self.attn.has_fp4_kv_cache):
                 out_dtype = torch.float8_e4m3fn
         output = q.new_empty([num_tokens, hidden_size], dtype=out_dtype)
         return output
@@ -402,8 +417,18 @@ class Attention(nn.Module):

         out_scale = None
         out_scale_sf = None
-        if self.has_quant_scale and not self.attn_output_gate:
+        has_awq_pre_quant_scale = hasattr(
+            self.o_proj,
+            'pre_quant_scale') and self.o_proj.pre_quant_scale is not None
+        # Don't set out_scale if o_proj has pre_quant_scale - this prevents FP8/FP4 output
+        # and keeps attention output in BF16 for better precision when applying pre_quant_scale
+        if self.has_quant_scale and not self.attn_output_gate and not has_awq_pre_quant_scale:
             out_scale = self.o_proj.inv_input_scale
+        if has_awq_pre_quant_scale and enable_attn_nvfp4_output:
+            logger.warning_once(
+                "Disable attn nvfp4 output because o_proj has pre_quant_scale for AWQ.",
+                key="disable_attn_nvfp4_output_for_awq")
+            enable_attn_nvfp4_output = False
         if self.o_proj.has_nvfp4 and self.support_nvfp4_output and enable_attn_nvfp4_output and not self.attn_output_gate:
             out_scale_sf = self.o_proj.input_scale

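A minimal sketch of why out_scale is skipped on the AWQ path (all tensor values invented): emitting FP8 bakes 1/input_scale into the attention output once, whereas AWQ needs the BF16 output so a per-channel pre_quant_scale can still multiply in before o_proj quantizes:

import torch

x = torch.randn(4, 8, dtype=torch.bfloat16)   # attention output, high precision
input_scale = torch.tensor(0.05)              # o_proj's static input scale
# FP8 path: fold in 1/input_scale, then store as float8_e4m3fn.
fp8_out = (x.float() * (1.0 / input_scale)).to(torch.float8_e4m3fn)
# AWQ path: keep BF16 so the per-channel pre_quant_scale can still be applied.
pre_quant_scale = torch.rand(8, dtype=torch.bfloat16)
awq_in = x * pre_quant_scale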
@@ -676,6 +701,8 @@ class MLA(nn.Module):
         dense_bias: Optional[bool] = None,
         config: Optional[ModelConfig] = None,
         enable_unit_test: bool = False,
+        mapping_with_cp: Optional[Mapping] = None,
+        reduce_output: bool = True,
     ):
         """
         Initialize the MLA module.
@@ -747,7 +774,12 @@ class MLA(nn.Module):

         # tensor parallel
         config = config or ModelConfig()
-        self.mapping = config.mapping
+        if mapping_with_cp is not None:
+            logger.warning(
+                "[MLA::__init__] Overriding mapping with CP detected.")
+            self.mapping = mapping_with_cp
+        else:
+            self.mapping = config.mapping
         tp_size = self.mapping.tp_size
         pp_size = self.mapping.pp_size
         cp_size = self.mapping.cp_size
@@ -755,6 +787,9 @@ class MLA(nn.Module):
             tp_size = 1
         if self.mapping.has_cp_ulysses():
            raise NotImplementedError("MLA doesn't support CP Ulyssees yet")
+        if self.mapping.cp_size > 1:
+            assert self.mapping.has_cp_helix(
+            ), f"CP type must be HELIX for MLA, but got {self.mapping.cp_config['cp_type']}."

         mapping = Mapping(
             world_size=tp_size * pp_size * cp_size,
@@ -875,6 +910,7 @@ class MLA(nn.Module):
             tensor_parallel_mode=TensorParallelMode.ROW,
             quant_config=quant_config,
             skip_create_weights_in_init=config.skip_create_weights_in_init,
+            reduce_output=reduce_output,
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization)

@@ -1044,7 +1080,7 @@ class MLA(nn.Module):
                          k: torch.Tensor, v: torch.Tensor,
                          position_ids: Optional[torch.Tensor],
                          attn_metadata: AttentionMetadata, **kwargs):
-        if self.mapping.cp_size > 1:
+        if self.mapping.has_cp_helix():
             # partial_o: [num_tokens, num_heads_tp * kv_lora_rank]
             # softmax_stats: [num_tokens, num_heads_tp, 2]
             softmax_stats = torch.empty((q.shape[0], self.num_heads_tp, 2),
@@ -1062,24 +1098,20 @@ class MLA(nn.Module):
             # similar to the post-processing of ring attention
             kv_lora_rank = partial_o.shape[-1] // self.num_heads_tp
             assert self.kv_lora_rank == kv_lora_rank
-            chunks_o = [
-                t.contiguous() for t in torch.split(partial_o,
-                                                    partial_o.shape[-1] //
-                                                    self.mapping.cp_size,
-                                                    dim=-1)
-            ]
-            chunks_stats = [
-                t.contiguous() for t in torch.split(softmax_stats,
-                                                    softmax_stats.shape[1] //
-                                                    self.mapping.cp_size,
-                                                    dim=1)
-            ]
-            gathered_o, gathered_stats = alltoall_helix(
-                chunks_o + chunks_stats,
-                self.mapping.cp_group,
-            )
-            return torch.ops.trtllm.helix_post_process(gathered_o,
-                                                       gathered_stats, 1.0)
+            # transpose the tensors to make the split across cp_size contiguous
+            # for both tensors, we need to split across the second dimension
+            chunks = []
+            for t in [partial_o, softmax_stats]:
+                t = t.transpose(1, 0).contiguous()
+                chunks.extend(torch.split(t,
+                                          t.shape[0] // self.mapping.cp_size))
+            gathered = alltoall_helix(chunks, self.mapping.cp_group)
+            # transpose the tensors back to ensure dimensions are ordered correctly
+            # note: an additional dimension was added at the first index for all-to-all,
+            # so the transpose dimensions are shifted by 1
+            gathered = [t.transpose(1, 2).contiguous() for t in gathered]
+            return torch.ops.trtllm.helix_post_process(gathered[0], gathered[1],
+                                                       1.0)
         else:
             attn_output = attn_backend.forward(q, k, v, attn_metadata, **kwargs)
             return attn_output
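The transpose/split round-trip above can be sketched in isolation (shapes invented); here torch.stack stands in for alltoall_helix, which is assumed to exchange one chunk per CP rank and prepend a rank dimension:

import torch

cp_size, num_tokens, num_heads_tp, kv_lora_rank = 2, 3, 4, 8
partial_o = torch.randn(num_tokens, num_heads_tp * kv_lora_rank)
# Transpose so the per-rank split runs over a contiguous leading dimension.
t = partial_o.transpose(1, 0).contiguous()          # [heads*rank, tokens]
chunks = list(torch.split(t, t.shape[0] // cp_size))
gathered = torch.stack(chunks)                      # stand-in for the all-to-all
# The exchange added a leading rank axis, so the restoring transpose shifts by 1.
restored = gathered.transpose(1, 2).contiguous()    # [cp_size, tokens, heads*rank/cp]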
@@ -1320,7 +1352,8 @@ class MLA(nn.Module):
                        self.qk_rope_head_dim)
         k = k.view(-1, self.num_heads_tp * self.qk_head_dim)

-        helix_position_offsets = position_ids if self.mapping.cp_size > 1 else None
+        helix_position_offsets = position_ids if self.mapping.has_cp_helix(
+        ) else None

         attn_output = self.mha.forward(
             q,
@@ -1700,6 +1733,12 @@ class MLA(nn.Module):
             device=q.device,
         )

+        helix_position_offsets, helix_is_inactive_rank = None, None
+        if self.mapping.has_cp_helix():
+            helix_position_offsets = position_ids
+            helix_is_inactive_rank = attn_metadata.helix_is_inactive_rank
+            assert helix_position_offsets is not None and helix_is_inactive_rank is not None, "helix_position_offsets and helix_is_inactive_rank must be provided for helix parallelism."
+
         rope_stream = self.aux_stream if not has_fp8_kv_cache else None
         if self.k_b_proj_trans.dtype == torch.bfloat16:
             # [num_heads, num_tokens, self.qk_nope_head_dim]
@@ -1714,9 +1753,18 @@ class MLA(nn.Module):
                 lambda: torch.ops.trtllm.bmm_out(
                     q_nope_t, self.k_b_proj_trans.transpose(1, 2), q_nope_out),
                 lambda: self.mqa.mla_rope_generation(
-                    fused_q, q_pe, latent_cache, attn_metadata, cu_q_seqlens,
-                    cu_kv_seqlens, fmha_scheduler_counter, mla_bmm1_scale,
-                    mla_bmm2_scale, quant_q_buffer),
+                    fused_q,
+                    q_pe,
+                    latent_cache,
+                    attn_metadata,
+                    cu_q_seqlens,
+                    cu_kv_seqlens,
+                    fmha_scheduler_counter,
+                    mla_bmm1_scale,
+                    mla_bmm2_scale,
+                    quant_q_buffer,
+                    helix_position_offsets=helix_position_offsets,
+                    helix_is_inactive_rank=helix_is_inactive_rank),
                 self.ln_events[0],
                 self.ln_events[1],
                 rope_stream,
@@ -1735,9 +1783,18 @@ class MLA(nn.Module):
                     self.k_b_proj_trans_dequant,
                 ),
                 lambda: self.mqa.mla_rope_generation(
-                    fused_q, q_pe, latent_cache, attn_metadata, cu_q_seqlens,
-                    cu_kv_seqlens, fmha_scheduler_counter, mla_bmm1_scale,
-                    mla_bmm2_scale, quant_q_buffer),
+                    fused_q,
+                    q_pe,
+                    latent_cache,
+                    attn_metadata,
+                    cu_q_seqlens,
+                    cu_kv_seqlens,
+                    fmha_scheduler_counter,
+                    mla_bmm1_scale,
+                    mla_bmm2_scale,
+                    quant_q_buffer,
+                    helix_position_offsets=helix_position_offsets,
+                    helix_is_inactive_rank=helix_is_inactive_rank),
                 self.ln_events[0],
                 self.ln_events[1],
                 rope_stream,
@@ -2031,9 +2088,10 @@ class MLA(nn.Module):

         # [seq, num_heads, kv_lora_rank], account for padding
         attn_out_latent = attn_out_latent[:, :self.num_heads_tp, :]
-        # TODO: seems we need .contiguous() here when padding enabled before pass to bmm?
         attn_out_latent = attn_out_latent.view(
             [-1, self.num_heads_tp, self.kv_lora_rank])
+        if self.num_heads_tp != padding:
+            attn_out_latent = attn_out_latent.contiguous()

         assert (attn_out_latent.shape[0] == q.shape[0]
                 and attn_out_latent.shape[1] == self.num_heads_tp)
@@ -2058,7 +2116,6 @@ class MLA(nn.Module):
         else:
             raise NotImplementedError(
                 f"Missing bmm impl for dtype: {self.v_b_proj.dtype}.")
-
         return output

     def forward(
@@ -2089,7 +2146,7 @@ class MLA(nn.Module):
             output=attn_output,
             latent_cache_gen=latent_cache_gen)

-        if self.enable_unit_test and self.mapping.cp_size > 1:
+        if self.enable_unit_test and self.mapping.has_cp_helix():
             # note: for allowing testing Helix parallelism, we ensure that
             # the output is compatible with o_proj even in the context phase,
             # thus we cut it to num_heads_tp_cp * v_head_dim

@@ -47,8 +47,8 @@ from ..modules.fused_moe.moe_load_balancer import (MoeLoadBalancer,
 from ..speculative import (SpecMetadata, get_num_extra_kv_tokens,
                            get_spec_metadata,
                            update_spec_config_from_model_config)
-from ..speculative.drafting_loops import ChainDrafter
-from ..speculative.eagle3 import Eagle3ResourceManager
+from ..speculative.drafting_loops import BaseDraftingLoopWrapper
+from ..speculative.eagle3 import Eagle3ResourceManager, Eagle3SpecMetadata
 from ..speculative.mtp import SampleStateTensorsMTP
 from ..speculative.utils import SpecDecodingTensor
 from ..utils import (get_model_extra_attrs,
@@ -181,13 +181,18 @@ class PyTorchModelEngine(ModelEngine):

         self.attn_runtime_features = attn_runtime_features or AttentionRuntimeFeatures(
         )
-        self.input_processor = create_input_processor(model_path, None)
+
+        self.input_processor = create_input_processor(
+            model_path,
+            tokenizer=None,
+            checkpoint_format=llm_args.checkpoint_format)
         self.input_processor_with_hash = create_input_processor_with_hash(
             self.input_processor)
         if model is None:
             lora_config: Optional[
                 LoraConfig] = None if is_draft_model else llm_args.lora_config
-            loader = ModelLoader(
+            # Keep the model_loader to support reloading the model weights later
+            self.model_loader = ModelLoader(
                 llm_args=llm_args,
                 mapping=self.mapping,
                 spec_config=self.spec_config,
@@ -196,7 +201,7 @@ class PyTorchModelEngine(ModelEngine):
                 max_seq_len=self.max_seq_len,
                 lora_config=lora_config,
             )
-            self.model, moe_load_balancer = loader.load(
+            self.model, moe_load_balancer = self.model_loader.load(
                 checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader)
             if isinstance(moe_load_balancer, MoeLoadBalancer):
                 setattr(self, "moe_load_balancer", moe_load_balancer)
@@ -278,7 +283,8 @@ class PyTorchModelEngine(ModelEngine):
                 enable_piecewise_cuda_graph=self.
                 _torch_compile_piecewise_cuda_graph,
                 capture_num_tokens=self._piecewise_cuda_graph_num_tokens,
-                max_num_streams=torch_compile_max_num_streams)
+                max_num_streams=torch_compile_max_num_streams,
+                mapping=self.mapping)
             if isinstance(self.model, DecoderModelForCausalLM):
                 self.model.model = torch.compile(
                     self.model.model,
@@ -562,12 +568,13 @@ class PyTorchModelEngine(ModelEngine):
         # Reset the global cuda graph dummy request to None in warmup.
         self.cuda_graph_runner.padding_dummy_request = None

         # TODO: current warmup_request is not suitable for context parallelism.
         cp_type = self.mapping.cp_config.get('cp_type', None)
-        if cp_type is not None:
-            logger.info("[ModelEngine::warmup] Skipping warmup for cp_type: ",
-                        cp_type.name)
-            return
+        if cp_type in [CpType.ULYSSES, CpType.STAR]:
+            logger.info(
+                "[ModelEngine::warmup] Skipping warmup for cp_type: ",
+                cp_type.name)
+            return

         self._run_torch_compile_warmup(resource_manager)
         self._run_autotuner_warmup(resource_manager)
@@ -779,8 +786,8 @@ class PyTorchModelEngine(ModelEngine):

     def _get_num_extra_decoding_steps(self) -> int:
         """Determines extra decoding steps needed for fused drafting loops."""
-        if isinstance(self.model, ChainDrafter):
-            return self.model.max_draft_len
+        if isinstance(self.model, BaseDraftingLoopWrapper):
+            return self.model.max_total_draft_tokens
         else:
             assert not self.model_is_wrapped, (
                 f"Please add logic to determine num_extra_decoding_steps for drafting loop {type(self.model)}"
@@ -966,16 +973,16 @@ class PyTorchModelEngine(ModelEngine):
         cache_indirection = self.cache_indirection_attention if self.attn_backend.Metadata is TrtllmAttentionMetadata else None
-        num_attention_heads = getattr(self.model.model_config.pretrained_config,
-                                      'num_attention_heads', None)
-        if num_attention_heads is not None:
-            num_key_value_heads = getattr(
-                self.model.model_config.pretrained_config,
-                'num_key_value_heads', None)
-            if num_key_value_heads is not None:
-                num_heads_per_kv = num_attention_heads // num_key_value_heads
-            else:
-                num_heads_per_kv = 1
+        config = self.model.model_config.pretrained_config
+
+        num_attention_heads = getattr(config, 'num_attention_heads', None)
+        num_key_value_heads = getattr(config, 'num_key_value_heads', None)
+
+        if num_attention_heads is not None and num_key_value_heads is not None:
+            num_heads_per_kv = num_attention_heads // num_key_value_heads
+        else:
+            num_heads_per_kv = 1
+
         if kv_cache_manager is None:
             return self.attn_backend.Metadata(
                 max_num_requests=self.batch_size,
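For example (head counts invented), the ratio feeding the attention metadata is simply query heads per KV head, defaulting to 1 when either attribute is missing from the pretrained config:

num_attention_heads, num_key_value_heads = 32, 8   # hypothetical GQA config
if num_attention_heads is not None and num_key_value_heads is not None:
    num_heads_per_kv = num_attention_heads // num_key_value_heads   # 4
else:
    num_heads_per_kv = 1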
@@ -1218,6 +1225,11 @@ class PyTorchModelEngine(ModelEngine):
             return list(self.dist.tp_allgather(attn_metadata.num_tokens))
         return None

+    def _get_all_rank_ctx_requests(self, num_ctx_requests: int):
+        if self.enable_attention_dp:
+            return list(self.dist.tp_allgather(num_ctx_requests))
+        return None
+
     def _get_padding_params(
             self, total_num_tokens: int, num_ctx_requests: int,
             attn_all_rank_num_tokens: Optional[List[int]]
@@ -1231,6 +1243,9 @@ class PyTorchModelEngine(ModelEngine):
         """
         padded_num_tokens = total_num_tokens

+        all_rank_ctx_requests = self._get_all_rank_ctx_requests(
+            num_ctx_requests)
+
         def get_padded_piecewise_tokens(tokens):
             captured_num_tokens = self._torch_compile_backend.capture_num_tokens
             return captured_num_tokens[bisect.bisect_left(
@@ -1243,7 +1258,12 @@ class PyTorchModelEngine(ModelEngine):
                 -1]
             # Torch piecewise cuda graph is enabled.
             if attn_all_rank_num_tokens is not None:
-                can_run_piecewise_cuda_graph = (num_ctx_requests != 0 and
+                # Any rank has context requests, we enable piecewise cuda graph.
+                has_ctx_requests = num_ctx_requests != 0 or (
+                    all_rank_ctx_requests is not None
+                    and any(ctx_requests != 0
+                            for ctx_requests in all_rank_ctx_requests))
+                can_run_piecewise_cuda_graph = (has_ctx_requests and
                                                 max(attn_all_rank_num_tokens)
                                                 <= max_captured_num_tokens)
                 all_ranks_can_run_piecewise_cuda_graph = list(
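A toy trace of the relaxed gate above (rank counts invented): with attention DP, the piecewise CUDA graph path now stays eligible as long as any rank still holds context requests, not only the local one:

num_ctx_requests = 0                    # this rank is decode-only
all_rank_ctx_requests = [0, 3, 0, 1]    # gathered via tp_allgather
has_ctx_requests = num_ctx_requests != 0 or (
    all_rank_ctx_requests is not None
    and any(n != 0 for n in all_rank_ctx_requests))
assert has_ctx_requests                 # graph path remains eligible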
@@ -1296,7 +1316,8 @@ class PyTorchModelEngine(ModelEngine):
             new_tensors_device: Optional[SampleStateTensors] = None,
             cache_indirection_buffer: Optional[torch.Tensor] = None,
             num_accepted_tokens_device: Optional[torch.Tensor] = None,
-            req_id_to_old_request: Optional[Dict[int, LlmRequest]] = None):
+            req_id_to_old_request: Optional[Dict[int, LlmRequest]] = None,
+            resource_manager: Optional[ResourceManager] = None):
         """
         Prepare inputs for Pytorch Model.
         """
@@ -1330,6 +1351,9 @@ class PyTorchModelEngine(ModelEngine):
         multimodal_params_list = []
         mrope_position_ids = []
         num_accepted_draft_tokens = []  # per request
+        # if using tree decoding, we need to store the request type and accepted path for each request,
+        # which will be used to update the hidden_states_read_indices.
+        request_accepted_path = {}  # per request

         # Variables for updating the inputs of draft model
         # Base values for gather_ids computation
@@ -1370,6 +1394,9 @@ class PyTorchModelEngine(ModelEngine):
                 gather_ids.append(len(input_ids) - 1)
                 sequence_lengths.append(len(prompt_tokens))
                 num_accepted_draft_tokens.append(len(prompt_tokens) - 1)
+                request_accepted_path[
+                    request.
+                    py_request_id] = request.py_num_accepted_draft_tokens_indices
                 prompt_lengths.append(len(prompt_tokens))
                 past_seen_token_num = begin_compute
                 num_cached_tokens_per_seq.append(past_seen_token_num)
@@ -1444,11 +1471,22 @@ class PyTorchModelEngine(ModelEngine):
             assert spec_config.spec_dec_mode.support_overlap_scheduler(
             ), f"{spec_config.decoding_type} does not support overlap scheduler"

+        spec_resource_manager, spec_tree_manager = None, None
+        if spec_config is not None:
+            spec_resource_manager = resource_manager.get_resource_manager(
+                ResourceManagerType.SPEC_RESOURCE_MANAGER)
+            if spec_resource_manager is not None and hasattr(
+                    spec_resource_manager, 'spec_tree_manager'):
+                spec_tree_manager = spec_resource_manager.spec_tree_manager
+
         # will contain previous batch indices of generation requests
         previous_batch_indices = []
         previous_pos_indices = []
         for request in extend_requests:
             request_ids.append(request.py_request_id)
+            request_accepted_path[
+                request.
+                py_request_id] = request.py_num_accepted_draft_tokens_indices
             # the request has no previous tensor:
             # (1) next_draft_tokens_device is None, which means overlap scheduler is disabled; or
             # (2) a dummy request; or
@@ -1466,7 +1504,7 @@ class PyTorchModelEngine(ModelEngine):
                 past_seen_token_num = request.max_beam_num_tokens - 1
                 draft_lens.append(num_draft_tokens)
                 if self.enable_spec_decode and spec_config.spec_dec_mode.extend_ctx(
-                        self.attn_backend):
+                        self.attn_backend) and spec_config.is_linear_tree:
                     # We're treating the prompt lengths as context requests here, so
                     # the prompt lens should not include the cached tokens.
                     prompt_lengths.append(1 + num_draft_tokens)
@@ -1479,10 +1517,20 @@ class PyTorchModelEngine(ModelEngine):
                         list(
                             range(len(position_ids),
                                   len(position_ids) + 1 + self.runtime_draft_len)))
-                position_ids.extend(
-                    list(
-                        range(past_seen_token_num,
-                              past_seen_token_num + 1 + num_draft_tokens)))
+                # For the target model + tree decoding
+                if not self.is_draft_model and not spec_config.is_linear_tree:
+                    assert spec_tree_manager is not None
+                    assert num_draft_tokens == spec_tree_manager.max_total_draft_tokens
+                    position_ids.extend(
+                        past_seen_token_num +
+                        spec_tree_manager.spec_dec_position_offsets[
+                            0]  # [max_total_draft_tokens + 1]
+                    )
+                else:
+                    position_ids.extend(
+                        list(
+                            range(past_seen_token_num,
+                                  past_seen_token_num + 1 + num_draft_tokens)))
                 num_cached_tokens_per_seq.append(past_seen_token_num)
                 request.cached_tokens = num_cached_tokens_per_seq[-1]
                 # update batch index
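A small worked example of the tree branch above (offsets invented): each draft token's position is the cached length plus its depth in the draft tree, so sibling nodes share a position id:

import torch

past_seen_token_num = 100
# Depths for a root plus four draft nodes of a hypothetical tree.
spec_dec_position_offsets = torch.tensor([0, 1, 1, 2, 2])
position_ids = past_seen_token_num + spec_dec_position_offsets
print(position_ids.tolist())  # [100, 101, 101, 102, 102]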
@@ -1502,10 +1550,21 @@ class PyTorchModelEngine(ModelEngine):
                         list(
                             range(len(position_ids),
                                   len(position_ids) + 1 + self.runtime_draft_len)))
-                position_ids.extend(
-                    list(
-                        range(past_seen_token_num, past_seen_token_num + 1 +
-                              self.runtime_draft_len)))
+                # For the target model + tree decoding
+                if not self.is_draft_model and not spec_config.is_linear_tree:
+                    assert spec_tree_manager is not None
+                    assert num_draft_tokens == spec_tree_manager.max_total_draft_tokens
+                    position_ids.extend(
+                        past_seen_token_num +
+                        spec_tree_manager.spec_dec_position_offsets[
+                            0]  # [max_total_draft_tokens + 1]
+                    )
+                else:
+                    position_ids.extend(
+                        list(
+                            range(
+                                past_seen_token_num, past_seen_token_num + 1 +
+                                self.runtime_draft_len)))
                 # previous tensor
                 previous_batch_indices.append(previous_batch_idx)
                 previous_pos_indices.extend([previous_batch_idx] *
@@ -1515,7 +1574,7 @@ class PyTorchModelEngine(ModelEngine):
                                             self.runtime_draft_len + 1)
                 request.cached_tokens = num_cached_tokens_per_seq[-1]
                 if self.enable_spec_decode and spec_config.spec_dec_mode.extend_ctx(
-                        self.attn_backend):
+                        self.attn_backend) and spec_config.is_linear_tree:
                     prompt_lengths.append(1 + self.runtime_draft_len)
                 else:
                     prompt_lengths.append(request.py_prompt_len)
@@ -1563,6 +1622,9 @@ class PyTorchModelEngine(ModelEngine):
                     request.py_num_accepted_draft_tokens)

                 sequence_lengths.append(1 + self.original_max_draft_len)
+                request_accepted_path[
+                    request.
+                    py_request_id] = request.py_num_accepted_draft_tokens_indices
                 prompt_lengths.append(request.py_prompt_len)
                 past_seen_token_num = begin_compute
                 num_cached_tokens_per_seq.append(past_seen_token_num)
@@ -1570,6 +1632,7 @@ class PyTorchModelEngine(ModelEngine):
                 # update batch index
                 request.py_batch_idx = request.py_seq_slot

+        helix_is_inactive_rank = [] if self.mapping.has_cp_helix() else None
         for request in generation_requests:
             request_ids.append(request.py_request_id)
             beam_width = request.sampling_config.beam_width
@@ -1602,16 +1665,26 @@ class PyTorchModelEngine(ModelEngine):
                 if beam == first_beam:
                     previous_batch_indices.append(request.py_batch_idx)
                 past_seen_token_num = request.max_beam_num_tokens
+
                 position_id = past_seen_token_num
                 if self.mapping.has_cp_helix():
-                    # Do an allgather among CP ranks to get the complete sequence length seen by all CP ranks.
-                    past_seen_token_nums = self.dist.cp_allgather(
-                        past_seen_token_num)
-                    position_id = sum(past_seen_token_nums)
+                    # Warmup doesn't have `total_input_len_cp` set because merge_helix_requests is not called.
+                    if not self.is_warmup and not request.is_cuda_graph_dummy:
+                        position_id = request.total_input_len_cp + request.py_decoding_iter - 1
+                    # TODO: [TRTLLM-5972] Lift the limitation that last rank is always the active one for helix.
+                    if self.mapping.cp_rank == self.mapping.cp_size - 1:
+                        past_seen_token_num = request.orig_prompt_len + request.py_decoding_iter - 1
+                    else:
+                        # past_seen_token_num doesn't grow on inactive ranks.
+                        past_seen_token_num = request.orig_prompt_len
+
                 position_ids.append(position_id)
                 num_cached_tokens_per_seq.append(past_seen_token_num)
                 request.cached_tokens = num_cached_tokens_per_seq[-1]
                 prompt_lengths.append(request.py_prompt_len)
+                if self.mapping.has_cp_helix():
+                    helix_is_inactive_rank.append(
+                        request.py_helix_is_inactive_rank)
                 draft_lens.append(0)
                 sequence_lengths.append(1)
                 num_accepted_draft_tokens.append(0)
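A standalone sketch of the helix bookkeeping above (all values invented): the position id advances with each decode iteration, while past_seen_token_num only grows on the last CP rank, which is the one currently active:

total_input_len_cp, py_decoding_iter = 512, 4
position_id = total_input_len_cp + py_decoding_iter - 1     # 515

orig_prompt_len, cp_rank, cp_size = 128, 3, 4
if cp_rank == cp_size - 1:      # active rank appends new KV entries
    past_seen_token_num = orig_prompt_len + py_decoding_iter - 1
else:                           # inactive ranks keep only their prompt shard
    past_seen_token_num = orig_prompt_len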
@@ -1660,7 +1733,8 @@ class PyTorchModelEngine(ModelEngine):
             num_draft_tokens = len(draft_tokens)
         total_num_tokens = len(position_ids)
         assert total_num_tokens <= self.max_num_tokens, (
-            "total_num_tokens should be less than or equal to max_num_tokens")
+            f"total_num_tokens ({total_num_tokens}) should be less than or equal to max_num_tokens ({self.max_num_tokens})"
+        )
         # if exist requests that do not have previous batch, copy input_ids and draft_tokens
         if num_tokens > 0:
             input_ids = torch.tensor(input_ids,
@@ -1941,12 +2015,15 @@ class PyTorchModelEngine(ModelEngine):

         attn_metadata.request_ids = request_ids
         attn_metadata.prompt_lens = prompt_lengths
+        attn_metadata.helix_is_inactive_rank = helix_is_inactive_rank
         attn_metadata.num_contexts = len(scheduled_requests.context_requests)
         # Use num_chunked_ctx_requests to record the number of extend context requests,
         # so that we can update the kv_lens_cuda correctly in _preprocess_inputs.
         attn_metadata.num_chunked_ctx_requests = 0
         if self.enable_spec_decode and spec_config.spec_dec_mode.extend_ctx(
-                self.attn_backend):
+                self.attn_backend) and spec_config.is_linear_tree:
+            # For the tree decoding, we want to use XQA to process the draft tokens for the target model.
+            # Therefore, we do not treat them as the chunked context requests.
             attn_metadata.num_contexts += len(extend_requests)
             attn_metadata.num_chunked_ctx_requests = len(extend_requests)

@@ -2010,6 +2087,8 @@ class PyTorchModelEngine(ModelEngine):
             spec_metadata.seq_lens = sequence_lengths
             spec_metadata.num_accepted_draft_tokens = self.num_accepted_draft_tokens_cuda[:len(
                 num_accepted_draft_tokens)]
+            if isinstance(spec_metadata, Eagle3SpecMetadata):
+                spec_metadata.request_accepted_path = request_accepted_path
             spec_metadata.prepare()
             inputs['spec_metadata'] = spec_metadata

@@ -2516,7 +2595,8 @@ class PyTorchModelEngine(ModelEngine):
             new_tensors_device: Optional[SampleStateTensors] = None,
             cache_indirection_buffer: Optional[torch.Tensor] = None,
             num_accepted_tokens_device: Optional[torch.Tensor] = None,
-            req_id_to_old_request: Optional[Dict[int, LlmRequest]] = None):
+            req_id_to_old_request: Optional[Dict[int, LlmRequest]] = None,
+            resource_manager: Optional[ResourceManager] = None):
         if self.mapping is not None and 'cp_type' in self.mapping.cp_config:
             cp_type = self.mapping.cp_config['cp_type']
             if CpType.STAR == cp_type:
@@ -2534,7 +2614,7 @@ class PyTorchModelEngine(ModelEngine):
                 new_tensors_device,
                 cache_indirection_buffer,
                 num_accepted_tokens_device,
-                req_id_to_old_request)
+                req_id_to_old_request, resource_manager)

     @torch.inference_mode()
     @with_model_extra_attrs(lambda self: self.model.extra_attrs)
@@ -2554,6 +2634,9 @@ class PyTorchModelEngine(ModelEngine):
         if self.enable_spec_decode:
             spec_resource_manager = resource_manager.get_resource_manager(
                 ResourceManagerType.SPEC_RESOURCE_MANAGER)
+            spec_tree_manager = None
+            if isinstance(spec_resource_manager, Eagle3ResourceManager):
+                spec_tree_manager = spec_resource_manager.spec_tree_manager
             spec_metadata = self._set_up_spec_metadata(spec_resource_manager,
                                                        no_cache=kv_cache_manager
                                                        is None)
@@ -2562,9 +2645,16 @@ class PyTorchModelEngine(ModelEngine):
                 spec_resource_manager, self.is_draft_model, self.attn_backend,
                 self.model_is_wrapped, spec_metadata.is_spec_dec_tree)
             attn_metadata.update_spec_dec_param(
-                is_spec_dec_mode, spec_metadata.is_spec_dec_tree,
-                spec_metadata.is_spec_dec_dynamic_tree,
-                self.original_max_draft_len, spec_decoding_tensor)
+                batch_size=scheduled_requests.batch_size,
+                is_spec_decoding_enabled=is_spec_dec_mode,
+                is_spec_dec_tree=spec_metadata.is_spec_dec_tree,
+                is_spec_dec_dynamic_tree=spec_metadata.is_spec_dec_dynamic_tree,
+                max_draft_len=self.original_max_draft_len,
+                max_total_draft_tokens=self.original_max_total_draft_tokens,
+                model_is_wrapped=self.model_is_wrapped,
+                spec_metadata=spec_metadata,
+                spec_tree_manager=spec_tree_manager,
+                spec_decoding_tensor=spec_decoding_tensor)
         else:
             spec_resource_manager = None
             spec_metadata = None
@@ -2611,7 +2701,8 @@ class PyTorchModelEngine(ModelEngine):
             inputs, gather_ids = self._prepare_inputs(
                 padded_requests, kv_cache_manager, attn_metadata, spec_metadata,
                 new_tensors_device, cache_indirection_buffer,
-                num_accepted_tokens_device, req_id_to_old_request)
+                num_accepted_tokens_device, req_id_to_old_request,
+                resource_manager)

         with with_shared_pool(self.cuda_graph_runner.get_graph_pool()):
             if not can_run_graph:
@@ -2747,7 +2838,7 @@ class PyTorchModelEngine(ModelEngine):
             return {'mm_embeddings': mm_embeddings, 'logits': None}

     def _init_userbuffers(self, hidden_size):
-        if self.mapping.tp_size <= 1:
+        if self.mapping.tp_size <= 1 or self.mapping.pp_size > 1:
            return False

        # Disable UB for unsupported platforms

@@ -60,7 +60,7 @@
   <script>
     DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
     DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-    DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
+    DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
     DOCUMENTATION_OPTIONS.show_version_warning_banner =
       false;
   </script>
@@ -73,7 +73,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.2.0rc4" />
+<meta name="docsearch:version" content="1.2.0rc5" />


 </head>
@@ -369,7 +369,9 @@
 <li class="toctree-l2"><a class="reference internal" href="../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
 </ul>
 </details></li>
 </ul>
@@ -410,11 +412,14 @@
 <li class="toctree-l1"><a class="reference internal" href="../features/quantization.html">Quantization</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../features/sampling.html">Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../features/additional-outputs.html">Additional Outputs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../features/guided-decoding.html">Guided Decoding</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../features/speculative-decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../features/checkpoint-loading.html">Checkpoint Loading</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../features/helix.html">Helix Parallelism</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../features/kv-cache-connector.html">KV Cache Connector</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
 <ul class="nav bd-sidenav">
@@ -689,9 +694,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on November 23, 2025.</p>
+<p>Last updated on December 07, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>

 </div></div>
@ -513,7 +518,6 @@
|
||||
<h1>Source code for tensorrt_llm.executor.result</h1><div class="highlight"><pre>
|
||||
<span></span><span class="kn">import</span><span class="w"> </span><span class="nn">asyncio</span>
|
||||
<span class="kn">import</span><span class="w"> </span><span class="nn">json</span>
|
||||
<span class="kn">import</span><span class="w"> </span><span class="nn">threading</span>
|
||||
<span class="kn">import</span><span class="w"> </span><span class="nn">time</span>
|
||||
<span class="kn">import</span><span class="w"> </span><span class="nn">weakref</span>
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">dataclasses</span><span class="w"> </span><span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span>
|
||||
@ -528,12 +532,11 @@
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi</span><span class="w"> </span><span class="kn">import</span> <span class="n">tracing</span>
|
||||
|
||||
<span class="k">try</span><span class="p">:</span>
|
||||
<span class="kn">import</span><span class="w"> </span><span class="nn">ray</span>
|
||||
<span class="k">pass</span>
|
||||
<span class="k">except</span> <span class="ne">ModuleNotFoundError</span><span class="p">:</span>
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm</span><span class="w"> </span><span class="kn">import</span> <span class="n">ray_stub</span> <span class="k">as</span> <span class="n">ray</span>
|
||||
<span class="k">pass</span>
|
||||
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">.._ray_utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">unwrap_ray_errors</span>
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">.._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">mpi_disabled</span><span class="p">,</span> <span class="n">nvtx_range_debug</span><span class="p">,</span> <span class="n">ray_use_rpc</span>
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">.._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">nvtx_range_debug</span>
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings</span><span class="w"> </span><span class="kn">import</span> <span class="n">executor</span> <span class="k">as</span> <span class="n">tllm</span>
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">..disaggregated_params</span><span class="w"> </span><span class="kn">import</span> <span class="n">DisaggregatedParams</span>
|
||||
<span class="kn">from</span><span class="w"> </span><span class="nn">..llmapi.tracer</span><span class="w"> </span><span class="kn">import</span> <span class="n">global_tracer</span>
|
||||
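The optional-dependency pattern being stripped out above, shown on its own: when ray is not installed, a stub module exposing the same surface is imported under the same name, so the rest of the file needs no conditional code.

try:
    import ray
except ModuleNotFoundError:
    # ray_stub mirrors the subset of the ray API this module touches.
    from tensorrt_llm import ray_stub as ray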
@ -676,104 +679,12 @@
|
||||
|
||||
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">warmup_tensorrt_llm</span><span class="p">():</span>
|
||||
<span class="kn">import</span><span class="w"> </span><span class="nn">tensorrt_llm</span>
|
||||
<span class="nb">print</span><span class="p">(</span><span class="s2">"Warmup by importing tensorrt_llm with version"</span><span class="p">,</span>
|
||||
<span class="n">tensorrt_llm</span><span class="o">.</span><span class="n">version</span><span class="o">.</span><span class="n">__version__</span><span class="p">)</span>
|
||||
|
||||
|
||||
<span class="nd">@ray</span><span class="o">.</span><span class="n">remote</span><span class="p">(</span><span class="n">max_concurrency</span><span class="o">=</span><span class="mi">1000000</span><span class="p">,</span> <span class="n">num_cpus</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||||
<span class="k">class</span><span class="w"> </span><span class="nc">RayAsyncQueue</span><span class="p">:</span>
|
||||
<span class="w"> </span><span class="sd">"""Ray actor for async response handling."""</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">data</span> <span class="o">=</span> <span class="p">{}</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span> <span class="o">=</span> <span class="p">{}</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">warmup_done</span> <span class="o">=</span> <span class="kc">False</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">register</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
|
||||
<span class="k">assert</span> <span class="n">key</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Key </span><span class="si">{</span><span class="n">key</span><span class="si">}</span><span class="s2"> already registered"</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">asyncio</span><span class="o">.</span><span class="n">Event</span><span class="p">()</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">unregister</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
|
||||
<span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">:</span>
|
||||
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
|
||||
<span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">:</span>
|
||||
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">warmup</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">warmup_done</span><span class="p">:</span>
|
||||
<span class="k">return</span>
|
||||
<span class="n">warmup_tensorrt_llm</span><span class="p">()</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">warmup_done</span> <span class="o">=</span> <span class="kc">True</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">put_response</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
|
||||
<span class="k">assert</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Key </span><span class="si">{</span><span class="n">key</span><span class="si">}</span><span class="s2"> not registered"</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">item</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">()</span>
|
||||
|
||||
<span class="k">async</span> <span class="k">def</span><span class="w"> </span><span class="nf">get_async</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
|
||||
<span class="k">assert</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Key </span><span class="si">{</span><span class="n">key</span><span class="si">}</span><span class="s2"> not registered"</span>
|
||||
<span class="k">await</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span><span class="o">.</span><span class="n">wait</span><span class="p">()</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
|
||||
<span class="n">ret</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
<span class="k">return</span> <span class="n">ret</span>
|
||||
|
||||
|
||||
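RayAsyncQueue, removed by this commit, paired one asyncio.Event with one payload slot per request id, so a producer could publish a response once and a single consumer could await it. A minimal usage sketch, assuming an initialized Ray runtime and a caller inside a coroutine; the handle and key names are illustrative, not from the source:

# Hypothetical driver-side usage of the removed RayAsyncQueue actor.
import ray

ray.init()
queue = RayAsyncQueue.remote()
ray.get(queue.warmup.remote())             # pay the tensorrt_llm import cost up front
ray.get(queue.register.remote(42))         # a key must be registered before put/get
queue.put_response.remote(42, "response")  # producer: store the payload, set the event

async def consume():
    # Awaiting the ObjectRef of an async actor method yields its result.
    return await queue.get_async.remote(42)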
<span class="n">SYNC_QUEUE_MAX_CONCURRENCY</span> <span class="o">=</span> <span class="mi">2</span>
|
||||
|
||||
|
||||
<span class="nd">@ray</span><span class="o">.</span><span class="n">remote</span><span class="p">(</span><span class="n">max_concurrency</span><span class="o">=</span><span class="n">SYNC_QUEUE_MAX_CONCURRENCY</span><span class="p">,</span>
|
||||
<span class="n">num_cpus</span><span class="o">=</span><span class="n">SYNC_QUEUE_MAX_CONCURRENCY</span><span class="p">)</span>
|
||||
<span class="k">class</span><span class="w"> </span><span class="nc">RaySyncQueue</span><span class="p">:</span>
|
||||
<span class="w"> </span><span class="sd">"""Ray actor for sync response handling."""</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">data</span> <span class="o">=</span> <span class="p">{}</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span> <span class="o">=</span> <span class="p">{}</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">semaphore</span> <span class="o">=</span> <span class="n">threading</span><span class="o">.</span><span class="n">Semaphore</span><span class="p">(</span><span class="n">SYNC_QUEUE_MAX_CONCURRENCY</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">warmup_done</span> <span class="o">=</span> <span class="kc">False</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">register</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
|
||||
<span class="k">assert</span> <span class="n">key</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">,</span> <span class="sa">f</span><span class="s2">"Key </span><span class="si">{</span><span class="n">key</span><span class="si">}</span><span class="s2"> already registered"</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">threading</span><span class="o">.</span><span class="n">Event</span><span class="p">()</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">unregister</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
|
||||
<span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">:</span>
|
||||
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
|
||||
<span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">:</span>
|
||||
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">warmup</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">warmup_done</span><span class="p">:</span>
|
||||
<span class="k">return</span>
|
||||
<span class="n">warmup_tensorrt_llm</span><span class="p">()</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">warmup_done</span> <span class="o">=</span> <span class="kc">True</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">put_response</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Any</span><span class="p">):</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">item</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span><span class="o">.</span><span class="n">set</span><span class="p">()</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">get</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
|
||||
<span class="k">with</span> <span class="bp">self</span><span class="o">.</span><span class="n">semaphore</span><span class="p">:</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span><span class="o">.</span><span class="n">wait</span><span class="p">()</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">event_map</span><span class="p">[</span><span class="n">key</span><span class="p">]</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
|
||||
<span class="n">ret</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
<span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
|
||||
<span class="k">return</span> <span class="n">ret</span>
|
||||
|
||||
|
||||
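RaySyncQueue, also removed, is the blocking counterpart: threading.Event instead of asyncio.Event. The semaphore sized SYNC_QUEUE_MAX_CONCURRENCY - 1 plausibly keeps one of the actor's threads free to run put_response while the others block inside get; if every thread parked in get, nothing could set the event and the actor would deadlock. A short pairing sketch, again with illustrative names and an initialized Ray runtime:

# Hypothetical blocking-path usage of the removed RaySyncQueue actor.
sync_queue = RaySyncQueue.remote()
ray.get(sync_queue.register.remote(7))
sync_queue.put_response.remote(7, {"text": "done"})  # fire-and-forget publish
item = ray.get(sync_queue.get.remote(7))             # blocks until the event is set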
<span class="k">class</span><span class="w"> </span><span class="nc">GenerationResultBase</span><span class="p">:</span>
|
||||
<span class="w"> </span><span class="sd">''' This holds the core logic of the GenerationResult class. '''</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span>
|
||||
<span class="nb">id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
|
||||
<span class="n">sampling_params</span><span class="p">:</span> <span class="n">SamplingParams</span><span class="p">,</span>
|
||||
<span class="n">ray_queue</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">RayAsyncQueue</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
|
||||
<span class="n">background_error_handler</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
|
||||
<span class="n">postproc_params</span><span class="p">:</span> <span class="s2">"Optional[PostprocParams]"</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">id</span> <span class="o">=</span> <span class="nb">id</span>
|
||||
@ -791,22 +702,12 @@
        # torch backend will use trtllm sampler in beam search mode, but it does not support return logprobs incrementally
        self.use_trtllm_sampler = sampling_params.use_beam_search and sampling_params.best_of > 1

        if ray_queue is not None and not ray_use_rpc():
            if has_event_loop():
                self.aqueue = ray_queue
                self.queue = self.aqueue
            else:
                self.queue = ray_queue
                self.aqueue = None
            with unwrap_ray_errors():
                ray.get(self.queue.register.remote(id))
        if has_event_loop():
            self.aqueue = AsyncQueue()
            self.queue = self.aqueue.sync_q
        else:
            if has_event_loop():
                self.aqueue = AsyncQueue()
                self.queue = self.aqueue.sync_q
            else:
                self.queue = Queue()
                self.aqueue = None
            self.queue = Queue()
            self.aqueue = None

        # In Sampling mode, the Executor runtime will return best_of sequences
        # in total, which the LLM API will select the n-best sequences among

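The replacement logic keeps one result object usable from both sync and async callers: with a running event loop, results flow through an AsyncQueue whose sync_q facade can be fed from worker threads; otherwise a plain queue.Queue suffices. A self-contained sketch of that duality with a stand-in class (MiniAsyncQueue is illustrative, not the TensorRT-LLM AsyncQueue):

import asyncio
import threading

class MiniAsyncQueue:
    """Illustrative stand-in: an asyncio queue with a thread-safe sync facade."""

    def __init__(self):
        self._loop = asyncio.get_running_loop()  # requires an active event loop
        self._q = asyncio.Queue()
        self.sync_q = self                       # producer-side facade

    def put(self, item):
        # Safe from any thread: schedule the enqueue on the loop thread.
        self._loop.call_soon_threadsafe(self._q.put_nowait, item)

    async def get(self):
        return await self._q.get()

async def main():
    aqueue = MiniAsyncQueue()
    threading.Thread(target=aqueue.sync_q.put, args=("token",)).start()
    print(await aqueue.get())                    # -> "token"

asyncio.run(main())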
@ -1073,12 +974,6 @@
        else:
            raise ValueError(f"Unknown response type: {response}")

        if self._done and mpi_disabled() and not ray_use_rpc():
            assert hasattr(
                self.queue, "unregister"
            ), "Ray path should be activated for unregistering the Ray queue."
            self.queue.unregister.remote(self.id)

    def record_stats(self,
                     output: CompletionOutput,
                     stats: Optional[dict[str, float]] = None) -> None:

@ -1277,7 +1172,6 @@

            beam_output.finish_reason = 'stop'
            beam_output.stop_reason = stop_reason
            self.abort()
            self._done = True
            break

@ -1304,15 +1198,9 @@
        disaggregated_params: Optional[DisaggregatedParams] = None,
        logprob_params: Optional[LogprobParams] = None,
    ) -> None:
        use_async_queue = has_event_loop()
        shared_queue = None
        if executor and executor.use_ray_queue() and not ray_use_rpc():
            shared_queue = executor.async_response_queue_weakref if use_async_queue else executor.sync_response_queue_weakref

        super().__init__(
            generation_request.id,
            generation_request.sampling_params,
            shared_queue,
            background_error_handler,
            postproc_params=generation_request.postproc_params,
        )

@ -1371,22 +1259,12 @@
        return response

    def _result_step(self, timeout: Optional[float] = None):
        if mpi_disabled() and not ray_use_rpc():
            with unwrap_ray_errors():
                response = ray.get(self.queue.get.remote(self.request_id))
            response = self._handle_ray_response(response)
        else:
            response = self.queue.get()

        response = self.queue.get()
        self._handle_response(response)

    async def _aresult_step(self):
        assert self.aqueue is not None, "The asyncio event loop was not present during initialization, so async operations are not available."
        if mpi_disabled() and not ray_use_rpc():
            response = await self.aqueue.get_async.remote(self.request_id)
            response = self._handle_ray_response(response)
        else:
            response = await self.aqueue.get()
        response = await self.aqueue.get()
        global_tracer().log_instant("result_step.get")
        self._handle_response(response)

@ -1717,9 +1595,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on November 23, 2025.</p>
<p>Last updated on December 07, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
</div></div>

@ -60,7 +60,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
DOCUMENTATION_OPTIONS.show_version_warning_banner = false;
</script>

@ -73,7 +73,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.2.0rc4" />
<meta name="docsearch:version" content="1.2.0rc5" />
</head>

@ -369,7 +369,9 @@
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
</ul>
</details></li>
</ul>

@ -410,11 +412,14 @@
<li class="toctree-l1"><a class="reference internal" href="../../../features/quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/sampling.html">Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/additional-outputs.html">Additional Outputs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/guided-decoding.html">Guided Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/speculative-decoding.html">Speculative Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/helix.html">Helix Parallelism</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
<ul class="nav bd-sidenav">

@ -4085,8 +4090,6 @@
    '''
    Add an identity operation.

    TODO: Document why it can be done using a plugin!!!

    Parameters:
    input : Tensor
        The input tensor.
@ -8775,9 +8778,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -647,9 +652,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -3510,9 +3515,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -654,9 +659,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -903,9 +908,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@@ -369,7 +369,9 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
 </ul>
 </details></li>
 </ul>
@@ -410,11 +412,14 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../features/quantization.html">Quantization</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../features/sampling.html">Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../features/additional-outputs.html">Additional Outputs</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../features/guided-decoding.html">Guided Decoding</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../features/speculative-decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../features/helix.html">Helix Parallelism</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
 <ul class="nav bd-sidenav">
@@ -521,7 +526,7 @@
 import weakref
 from collections.abc import Mapping
 from pathlib import Path
-from typing import Any, List, Literal, Optional, Sequence, Union
+from typing import Any, List, Literal, Optional, Sequence, Union, cast

 import transformers
 from tqdm import tqdm
@@ -530,7 +535,8 @@
 from tensorrt_llm._utils import mpi_disabled
 from tensorrt_llm.inputs.data import TextPrompt
 from tensorrt_llm.inputs.multimodal import MultimodalInput, MultimodalParams
-from tensorrt_llm.inputs.registry import DefaultInputProcessor
+from tensorrt_llm.inputs.registry import (BaseMultimodalInputProcessor,
+                                          DefaultInputProcessor)
 from tensorrt_llm.llmapi import tracing
 from tensorrt_llm.metrics.enums import MetricNames
@@ -654,6 +660,9 @@
         logger.set_level("info")  # force display the backend

         try:
+            env_overrides = kwargs.get("env_overrides", None)
+            self._process_env_overrides(env_overrides)
+
             backend = kwargs.get('backend', None)
             if backend == "pytorch":
                 logger.info("Using LLM with PyTorch backend")
@@ -974,8 +983,10 @@
                 inputs, sampling_params)
         elif 'multi_modal_embeddings' in inputs:
             mm_embedding_info = inputs['multi_modal_embeddings']
-            prompt_token_ids, extra_processed_inputs = self.input_processor.attach_multimodal_embeddings(
-                inputs, mm_embedding_info, sampling_params)
+            prompt_token_ids, extra_processed_inputs = cast(
+                BaseMultimodalInputProcessor,
+                self.input_processor).attach_multimodal_embeddings(
+                    inputs, mm_embedding_info, sampling_params)
         else:
             with nvtx_range_debug("input_processor"):
                 prompt_token_ids, extra_processed_inputs = self.input_processor(
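The cast above only narrows the static type of self.input_processor; nothing changes at runtime. A minimal sketch of the pattern, with generic stand-in names that are not from this diff:

# typing.cast(T, value) returns value unchanged at runtime; it exists purely
# so static checkers treat value as T and allow T-specific attribute access.
from typing import cast

class MultimodalProcessorLike:  # stand-in for the real base class
    def attach(self) -> None:
        print("attached")

def run(processor: object) -> None:
    # Without the cast, a type checker rejects processor.attach() on `object`.
    cast(MultimodalProcessorLike, processor).attach()

run(MultimodalProcessorLike())  # prints "attached"; no runtime type check occurs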
@@ -1106,6 +1117,25 @@
         '''
         return self._executor.aget_kv_events(timeout=timeout)

+    def _process_env_overrides(self,
+                               env_overrides: Optional[dict[str, str]]) -> None:
+        if env_overrides is None:
+            return
+        logger.info("Processing LLM API environment variable overrides")
+        # TODO: If an env var is cached at import-time in code, overriding os.environ will
+        # unfortunately not update wherever the var is used.
+        # This is a known issue and only way to fix it is at every such usage to access it
+        # from os.environ on-demand.
+        for key, value in env_overrides.items():
+            str_value = str(value)
+            if key in os.environ:
+                old_value = os.environ[key]
+                os.environ[key] = str_value
+                logger.info(f"Overriding {key}: '{old_value}' -> '{str_value}'")
+            else:
+                os.environ[key] = str_value
+                logger.info(f"Setting {key}='{str_value}'")
+
     def _prepare_sampling_params(
             self,
             sampling_params: Optional[SamplingParams] = None) -> SamplingParams:
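Taken together with the env_overrides keyword consumed during construction, callers can set process environment variables before the backend spins up. A minimal usage sketch, assuming the public LLM entry point; the model path and variable name below are illustrative, not from this diff:

# Hypothetical usage: env_overrides is applied to os.environ during LLM
# construction, before backend selection reads any environment variables.
from tensorrt_llm import LLM

llm = LLM(model="/models/llama-3.3-70b",             # illustrative path
          env_overrides={"TLLM_LOG_LEVEL": "INFO"})  # variable name assumed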
@@ -1285,6 +1315,17 @@
             self.mpi_session.shutdown()
             self.mpi_session = None

+    def _check_health(self) -> bool:
+        """Check if the LLM is healthy.
+
+        Returns:
+            bool: True if the executor is running and not shutdown, False otherwise.
+        """
+        if hasattr(self, "_executor") and self._executor is not None:
+            return not self._executor.is_shutdown()
+
+        return False
+
     @staticmethod
     def _shutdown_wrapper(self_ref):
         # Retrieve the instance if it still exists
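The new helper gives orchestration code a cheap liveness probe. A sketch of a hypothetical caller, shown only to illustrate the contract; it relies on a private method and is not an endorsed API:

import logging

logger = logging.getLogger(__name__)

# _check_health() is True only while an executor exists and has not been
# shut down; anything else (no executor yet, or already shut down) is False.
if not llm._check_health():  # `llm` from the earlier sketch; private API
    logger.warning("LLM executor is down; recreating the instance")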
@@ -746,14 +751,19 @@
     """
     algorithm: ClassVar[str] = "rocket"
     window_size: Optional[int] = Field(
-        default=None, description="The window size for snap KV.")
+        default=32, description="The window size for snap KV.")
     kernel_size: Optional[int] = Field(
-        default=None, description="The kernel size for snap KV.")
-    topr: Optional[Union[int, float]] = Field(default=76, description="Top-r")
-    topk: Optional[int] = Field(default=128, description="Top-k")
-    prompt_budget: Optional[int] = Field(default=1266,
+        default=63, description="The kernel size for snap KV.")
+    topr: Optional[Union[int, float]] = Field(default=128, description="Top-r")
+    topk: Optional[int] = Field(default=64, description="Top-k")
+    prompt_budget: Optional[int] = Field(default=2048,
                                          description="Prompt budget")
-    page_size: Optional[int] = Field(default=3, description="Page size")
+    page_size: Optional[int] = Field(default=4, description="Page size")
+    kt_cache_dtype: Optional[str] = Field(
+        default='float8_e5m2',
+        choices=['bfloat16', 'float8_e5m2'],
+        description="KT cache dtype",
+    )

 <div class="viewcode-block" id="RocketSparseAttentionConfig.from_dict">
 <a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.RocketSparseAttentionConfig.from_dict">[docs]</a>
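With the retuned defaults above, a bare construction now picks up the new values. A quick sketch, assuming the class is importable from tensorrt_llm.llmapi as the viewcode reference implies:

# Defaults after this change: window_size=32, kernel_size=63, topr=128,
# topk=64, prompt_budget=2048, page_size=4, kt_cache_dtype='float8_e5m2'.
from tensorrt_llm.llmapi import RocketSparseAttentionConfig

cfg = RocketSparseAttentionConfig()
assert cfg.window_size == 32 and cfg.page_size == 4

# from_dict overrides individual fields while keeping the rest at defaults.
cfg = RocketSparseAttentionConfig.from_dict({"topk": 128})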
@@ -953,6 +963,34 @@


+class Nvfp4GemmConfig(StrictBaseModel):
+    """
+    Configuration for NVFP4 GEMM backend selection.
+    """
+    allowed_backends: List[str] = Field(
+        default=['cutlass', 'cublaslt', 'cuda_core'],
+        description="List of backends to consider for auto-selection. "
+        "Default excludes 'cutedsl' for faster build time. "
+        "Add 'cutedsl' for extreme performance at the cost of longer server launch time. "
+        "Valid values: 'cutlass', 'cublaslt', 'cutedsl', 'cuda_core'.")
+
+    @model_validator(mode="after")
+    def validate_allowed_backends(self) -> 'Nvfp4GemmConfig':
+        valid_backends = {'cutlass', 'cublaslt', 'cutedsl', 'cuda_core'}
+        invalid = set(self.allowed_backends) - valid_backends
+        if invalid:
+            raise ValueError(
+                f"Invalid backends in allowed_backends: {invalid}. "
+                f"Valid backends are: {sorted(valid_backends)}")
+        if not self.allowed_backends:
+            raise ValueError("allowed_backends cannot be empty.")
+        return self
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(**data)
+
+
 <div class="viewcode-block" id="AttentionDpConfig">
 <a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.AttentionDpConfig">[docs]</a>
 class AttentionDpConfig(StrictBaseModel):
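The model_validator above rejects unknown or empty backend lists at construction time. A sketch of the observable behavior; the import path is an assumption, since the diff shows only the class body:

# Hypothetical import path; the diff does not show where the class lives.
from tensorrt_llm.llmapi.llm_args import Nvfp4GemmConfig

cfg = Nvfp4GemmConfig()                                        # default backend list
cfg = Nvfp4GemmConfig(allowed_backends=['cutlass', 'cutedsl'])  # opt in to cutedsl

try:
    Nvfp4GemmConfig(allowed_backends=['triton'])               # not a valid backend
except Exception as err:  # pydantic surfaces the ValueError as a ValidationError
    print(err)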
@ -1261,6 +1299,10 @@
|
||||
<span class="k">return</span> <span class="n">TorchSpeculativeDecodingMode</span><span class="o">.</span><span class="n">from_string</span><span class="p">(</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">decoding_type</span><span class="o">.</span><span class="n">upper</span><span class="p">())</span>
|
||||
|
||||
<span class="nd">@functools</span><span class="o">.</span><span class="n">cached_property</span>
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">is_linear_tree</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span>
|
||||
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_draft_len</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_total_draft_tokens</span>


class KvCacheConnectorConfig(StrictBaseModel):
    """

@ -1599,7 +1641,7 @@
    def supports_backend(self, backend: str) -> bool:
        return backend == "pytorch"
        return backend == "pytorch" or backend == "_autodeploy"
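    # Editor's sketch (not part of the module source): with the updated line
    # above, draft-target speculation is accepted on both backends:
    #   cfg.supports_backend("pytorch")      # True
    #   cfg.supports_backend("_autodeploy")  # True after this change
    #   cfg.supports_backend("tensorrt")     # False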

@ -2630,6 +2672,12 @@
        status="prototype",
    )

    env_overrides: Optional[Dict[str, str]] = Field(
        default=None,
        description=
        "[EXPERIMENTAL] Environment variable overrides. NOTE: import-time-cached env vars in the code won't update unless the code fetches them from os.environ on demand.",
        status="prototype")

    _parallel_config: Optional[_ParallelConfig] = PrivateAttr(default=None)
    _model_format: Optional[_ModelFormatKind] = PrivateAttr(default=None)
    _speculative_model: Optional[str] = PrivateAttr(default=None)

@ -2753,86 +2801,6 @@
            use_fast=self.tokenizer_mode != 'slow')
        return self

    @model_validator(mode="after")
    def validate_model_format_misc(self):
        '''
        Load the model format, and do the following:

        1. Load the build_config if given an engine.
        2. Load the parallel_config if given a checkpoint.
        '''
        model_obj = _ModelWrapper(self.model)

        if model_obj.is_local_model and self.backend not in [
                'pytorch', '_autodeploy'
        ]:
            # Load parallel_config from the engine.
            model_format = get_model_format(
                self.model, trust_remote_code=self.trust_remote_code)

            if model_format is _ModelFormatKind.TLLM_ENGINE:
                if self.build_config is not None:
                    logger.warning(
                        "The build_config is ignored for model format of TLLM_ENGINE."
                    )
                self._load_config_from_engine(model_obj.model_dir)
                runtime_defaults = self._pretrained_config.runtime_defaults
                if runtime_defaults:
                    self.kv_cache_config.fill_empty_fields_from_runtime_defaults(
                        runtime_defaults)

            # Load parallel_config from the checkpoint.
            elif model_format is _ModelFormatKind.TLLM_CKPT:
                self._load_config_from_ckpt(model_obj.model_dir)
        else:
            model_format = _ModelFormatKind.HF

        # Store the model format in the values
        self._model_format = model_format
        return self

    @model_validator(mode="after")
    def init_build_config(self):
        """
        Create a default BuildConfig if none is provided.
        """
        build_config = getattr(self, "build_config", None)
        if build_config is None:
            kwargs = {}
            if self.max_batch_size:
                kwargs["max_batch_size"] = self.max_batch_size
            if self.max_num_tokens:
                kwargs["max_num_tokens"] = self.max_num_tokens
            if self.max_seq_len:
                kwargs["max_seq_len"] = self.max_seq_len
            if self.max_beam_width:
                kwargs["max_beam_width"] = self.max_beam_width
            if self.max_input_len:
                kwargs["max_input_len"] = self.max_input_len
            self.build_config = BuildConfig(**kwargs)
        return self

    @model_validator(mode="after")
    def set_runtime_knobs_from_build_config(self):
        # TODO: remove this after PyT become default to adapt PyT with build_config as input
        assert self.build_config is not None, "build_config is not initialized"
        if self.backend == "pytorch":
            if self.build_config:
                for key in [
                        "max_batch_size", "max_num_tokens", "max_seq_len",
                        "max_input_len", "max_beam_width"
                ]:
                    if getattr(self.build_config, key) is not None:
                        if (v := getattr(self, key,
                                         None)) is not None and v != getattr(
                                             self.build_config, key):
                            logger.warning(
                                f"overriding {key} from build_config")
                        setattr(self, key, getattr(self.build_config, key))

        return self

    @model_validator(mode="after")
    def validate_runtime_args(self):
        if self.max_batch_size is not None and self.max_num_tokens is not None:

@ -2842,181 +2810,6 @@
            )
        return self

    @model_validator(mode="after")
    def validate_build_config_with_runtime_params(self):
        # Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,
        # which will be passed to the C++ Executor API, overwriting the values
        # from a built engine. In order to set build configuration, it is
        # recommended to use build_config instead.
        assert isinstance(
            self.build_config, BuildConfig
        ), f"build_config is not initialized: {self.build_config}"

        if self.max_batch_size is not None:
            if self.max_batch_size > self.build_config.max_batch_size:
                # Warn before clamping so the message shows the original runtime value.
                logger.warning(
                    f"max_batch_size [{self.max_batch_size}] is overridden by build_config.max_batch_size [{self.build_config.max_batch_size}] in build_config"
                )
                self.max_batch_size = self.build_config.max_batch_size
        if self.max_num_tokens is not None:
            if self.max_num_tokens > self.build_config.max_num_tokens:
                logger.warning(
                    f"max_num_tokens [{self.max_num_tokens}] is overridden by build_config.max_num_tokens [{self.build_config.max_num_tokens}] in build_config"
                )
                self.max_num_tokens = self.build_config.max_num_tokens
        if self.max_seq_len is not None:
            if self.max_seq_len != self.build_config.max_seq_len:
                logger.warning(
                    f"max_seq_len [{self.max_seq_len}] is overridden by build_config.max_seq_len [{self.build_config.max_seq_len}] in build_config"
                )
        if self.max_beam_width is not None:
            if self.max_beam_width != self.build_config.max_beam_width:
                logger.warning(
                    f"max_beam_width [{self.max_beam_width}] is overridden by build_config.max_beam_width [{self.build_config.max_beam_width}] in build_config"
                )
        if self.max_input_len is not None:
            if self.max_input_len != self.build_config.max_input_len:
                logger.warning(
                    f"max_input_len [{self.max_input_len}] is overridden by build_config.max_input_len [{self.build_config.max_input_len}] in build_config"
                )

        return self

    @model_validator(mode="after")
    def validate_build_config_remaining(self):
        is_trt_llm_args = isinstance(self, TrtLlmArgs)

        # TODO: remove the checker when manage weights support all data types
        if is_trt_llm_args and self.fast_build and (self.quant_config.quant_algo
                                                    is QuantAlgo.FP8):
            self.build_config.plugin_config.manage_weights = True

        if self.parallel_config.world_size == 1 and self.build_config:
            self.build_config.plugin_config.nccl_plugin = None

        if self.enable_lora and self.backend != 'pytorch':
            self.build_config.plugin_config.lora_plugin = 'auto'
            if self.lora_config is not None:
                self.build_config.lora_config.max_lora_rank = self.lora_config.max_lora_rank

        if hasattr(self,
                   'enable_prompt_adapter') and self.enable_prompt_adapter:
            self.build_config.max_prompt_embedding_table_size = self.max_prompt_adapter_token * self.build_config.max_batch_size

        if self.max_beam_width is None:
            if self.build_config:
                self.max_beam_width = self.build_config.max_beam_width
            else:
                self.max_beam_width = 1

        return self

    @model_validator(mode="after")
    def validate_speculative_config(self):
        if self.speculative_config:
            if not self.speculative_config.supports_backend(self.backend):
                raise ValueError(
                    f"Speculation type {self.speculative_config.decoding_type} does not "
                    f"support backend {self.backend}")

            # Below, we only need to set speculative_decoding_mode/decoding_config for speculation
            # on the TRT backend.
            if isinstance(self.speculative_config, LookaheadDecodingConfig):
                max_draft_len = self.speculative_config.calculate_speculative_resource(
                )[2]
                assert max_draft_len > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.LOOKAHEAD_DECODING
                self.build_config.max_draft_len = max(
                    self.build_config.max_draft_len, max_draft_len)
                self.decoding_config = DecodingConfig(
                    decoding_mode=DecodingMode.Lookahead(),
                    lookahead_decoding_config=PybindMirror.maybe_to_pybind(
                        self.speculative_config))

            elif isinstance(self.speculative_config, MedusaDecodingConfig):
                assert self.speculative_config.max_draft_len > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA
                self.build_config.max_draft_len = self.speculative_config.max_draft_len
                self.decoding_config = DecodingConfig(
                    decoding_mode=DecodingMode.Medusa(),
                    medusa_choices=self.speculative_config.medusa_choices)

            elif isinstance(self.speculative_config, EagleDecodingConfig):
                assert self.speculative_config.max_draft_len > 0
                assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
                self.build_config.max_draft_len = self.speculative_config.max_draft_len
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
                if self.backend not in ['pytorch', '_autodeploy']:
                    eagle_config = _EagleConfig(
                        self.speculative_config.eagle_choices,
                        self.speculative_config.greedy_sampling,
                        self.speculative_config.posterior_threshold,
                        self.speculative_config.use_dynamic_tree,
                        self.speculative_config.dynamic_tree_max_topK)
                    self.decoding_config = DecodingConfig(
                        decoding_mode=DecodingMode.Eagle(),
                        eagle_config=eagle_config)

            elif isinstance(self.speculative_config, NGramDecodingConfig):
                assert self.backend in ['pytorch', '_autodeploy']
                assert self.speculative_config.max_draft_len > 0 and self.speculative_config.max_matching_ngram_size > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
                self.build_config.max_draft_len = self.speculative_config.max_draft_len

            elif isinstance(self.speculative_config, DraftTargetDecodingConfig):
                assert self.backend in ['pytorch']
                assert self.speculative_config.max_draft_len > 0
                assert self.speculative_config.speculative_model_dir is not None, "Path to draft model must be specified."
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
                self.build_config.max_draft_len = self.speculative_config.max_draft_len

            elif isinstance(self.speculative_config, MTPDecodingConfig):
                assert self.speculative_config.num_nextn_predict_layers > 0
                self.speculative_config.max_draft_len = self.speculative_config.num_nextn_predict_layers

            elif isinstance(self.speculative_config,
                            UserProvidedDecodingConfig):
                assert self.backend in ['pytorch', '_autodeploy']
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.USER_PROVIDED
                self.build_config.max_draft_len = self.speculative_config.max_draft_len

            elif isinstance(self.speculative_config, AutoDecodingConfig):
                assert self.backend in ['pytorch', '_autodeploy']
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.AUTO
                self.build_config.max_draft_len = self.speculative_config.max_draft_len

            elif isinstance(self.speculative_config,
                            SaveHiddenStatesDecodingConfig):
                assert self.backend in ['pytorch']
                logger.warning(
                    "SaveHiddenStatesDecodingConfig is active, setting max_batch_size to 1, disabling overlap scheduler, and setting cuda_graph_config to None"
                )
                self.build_config.max_batch_size = 1
                self.max_batch_size = 1
                self.disable_overlap_scheduler = True
                self.cuda_graph_config = None
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.SAVE_HIDDEN_STATES
                self.build_config.max_draft_len = 1
                self.speculative_config.max_draft_len = 1

            else:
                raise ValueError(
                    f"Unrecognized speculative config type {type(self.speculative_config)}"
                )

        else:
            self.decoding_config = None

        self._speculative_model = getattr(self.speculative_config,
                                          "speculative_model_dir", None)
        speculative_model_obj = _ModelWrapper(
            self._speculative_model
        ) if self._speculative_model is not None else None
        if self._speculative_model and speculative_model_obj.is_local_model:
            self._speculative_model_format = _ModelFormatKind.HF

        return self

    @model_validator(mode="after")
    def validate_lora_config_consistency(self):
        if self.lora_config:

@ -3054,66 +2847,6 @@
                    "while LoRA prefetch is not supported")
        return self

    def _load_config_from_engine(self, engine_dir: Path):
        engine_config = EngineConfig.from_json_file(engine_dir / "config.json")
        self._pretrained_config = engine_config.pretrained_config
        self.build_config = engine_config.build_config

        # load and check parallel_config
        mapping = self._pretrained_config.mapping
        if self.parallel_config.tp_size not in (1, mapping.tp_size):
            raise ValueError(
                f"tp_size {self.parallel_config.tp_size} is not consistent with the engine's tp_size {mapping.tp_size}"
            )
        if self.parallel_config.pp_size not in (1, mapping.pp_size):
            raise ValueError(
                f"pp_size {self.parallel_config.pp_size} is not consistent with the engine's pp_size {mapping.pp_size}"
            )
        if self.parallel_config.cp_size not in (1, mapping.cp_size):
            raise ValueError(
                f"cp_size {self.parallel_config.cp_size} is not consistent with the engine's cp_size {mapping.cp_size}"
            )
        self._parallel_config = _ParallelConfig(
            tp_size=mapping.tp_size,
            pp_size=mapping.pp_size,
            cp_size=mapping.cp_size,
            gpus_per_node=mapping.gpus_per_node,
            moe_cluster_size=mapping.moe_cluster_size,
            moe_tp_size=mapping.moe_tp_size,
            moe_ep_size=mapping.moe_ep_size)
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">_load_config_from_ckpt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ckpt_dir</span><span class="p">:</span> <span class="n">Path</span><span class="p">):</span>
|
||||
<span class="n">pretrained_config</span> <span class="o">=</span> <span class="n">PretrainedConfig</span><span class="o">.</span><span class="n">from_json_file</span><span class="p">(</span><span class="n">ckpt_dir</span> <span class="o">/</span>
|
||||
<span class="s2">"config.json"</span><span class="p">)</span>
|
||||
<span class="n">tp_size</span> <span class="o">=</span> <span class="n">pretrained_config</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span>
|
||||
<span class="n">pp_size</span> <span class="o">=</span> <span class="n">pretrained_config</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">pp_size</span>
|
||||
<span class="n">cp_size</span> <span class="o">=</span> <span class="n">pretrained_config</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">cp_size</span>
|
||||
<span class="n">moe_cluster_size</span> <span class="o">=</span> <span class="n">pretrained_config</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">moe_cluster_size</span>
|
||||
<span class="n">moe_tp_size</span> <span class="o">=</span> <span class="n">pretrained_config</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">moe_tp_size</span>
|
||||
<span class="n">moe_ep_size</span> <span class="o">=</span> <span class="n">pretrained_config</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">moe_ep_size</span>
|
||||
<span class="n">gpus_per_node</span> <span class="o">=</span> <span class="n">pretrained_config</span><span class="o">.</span><span class="n">mapping</span><span class="o">.</span><span class="n">gpus_per_node</span>
|
||||
<span class="c1"># load parallel_config</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">!=</span> <span class="mi">1</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">!=</span> <span class="n">tp_size</span><span class="p">:</span>
|
||||
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
||||
<span class="sa">f</span><span class="s2">"tp_size </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">tp_size</span><span class="si">}</span><span class="s2"> is not consistent with the checkpoint's tp_size </span><span class="si">{</span><span class="n">tp_size</span><span class="si">}</span><span class="s2">"</span>
|
||||
<span class="p">)</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">pp_size</span> <span class="o">!=</span> <span class="mi">1</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">pp_size</span> <span class="o">!=</span> <span class="n">pp_size</span><span class="p">:</span>
|
||||
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
||||
<span class="sa">f</span><span class="s2">"pp_size </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">pp_size</span><span class="si">}</span><span class="s2"> is not consistent with the checkpoint's pp_size </span><span class="si">{</span><span class="n">pp_size</span><span class="si">}</span><span class="s2">"</span>
|
||||
<span class="p">)</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">cp_size</span> <span class="o">!=</span> <span class="mi">1</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">cp_size</span> <span class="o">!=</span> <span class="n">cp_size</span><span class="p">:</span>
|
||||
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
||||
<span class="sa">f</span><span class="s2">"cp_size </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">cp_size</span><span class="si">}</span><span class="s2"> is not consistent with the checkpoint's cp_size </span><span class="si">{</span><span class="n">cp_size</span><span class="si">}</span><span class="s2">"</span>
|
||||
<span class="p">)</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">_parallel_config</span> <span class="o">=</span> <span class="n">_ParallelConfig</span><span class="p">(</span>
|
||||
<span class="n">tp_size</span><span class="o">=</span><span class="n">tp_size</span><span class="p">,</span>
|
||||
<span class="n">pp_size</span><span class="o">=</span><span class="n">pp_size</span><span class="p">,</span>
|
||||
<span class="n">cp_size</span><span class="o">=</span><span class="n">cp_size</span><span class="p">,</span>
|
||||
<span class="n">gpus_per_node</span><span class="o">=</span><span class="n">gpus_per_node</span><span class="p">,</span>
|
||||
<span class="n">moe_cluster_size</span><span class="o">=</span><span class="n">moe_cluster_size</span><span class="p">,</span>
|
||||
<span class="n">moe_tp_size</span><span class="o">=</span><span class="n">moe_tp_size</span><span class="p">,</span>
|
||||
<span class="n">moe_ep_size</span><span class="o">=</span><span class="n">moe_ep_size</span><span class="p">)</span>
|
||||
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">get_runtime_sizes</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">]:</span>
|
||||
<span class="k">return</span> <span class="p">(</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">,</span>
|
||||
@@ -3179,6 +2912,272 @@
    _convert_checkpoint_options: Dict[str,
                                      Any] = PrivateAttr(default_factory=dict)
<div class="viewcode-block" id="TrtLlmArgs.init_build_config">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.init_build_config">[docs]</a>
|
||||
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">"after"</span><span class="p">)</span>
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">init_build_config</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||||
<span class="w"> </span><span class="sd">"""</span>
|
||||
<span class="sd"> Creating a default BuildConfig if none is provided</span>
|
||||
<span class="sd"> """</span>
|
||||
<span class="n">build_config</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"build_config"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
|
||||
<span class="k">if</span> <span class="n">build_config</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||||
<span class="n">kwargs</span> <span class="o">=</span> <span class="p">{}</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_batch_size</span><span class="p">:</span>
|
||||
<span class="n">kwargs</span><span class="p">[</span><span class="s2">"max_batch_size"</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_batch_size</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_num_tokens</span><span class="p">:</span>
|
||||
<span class="n">kwargs</span><span class="p">[</span><span class="s2">"max_num_tokens"</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_num_tokens</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_seq_len</span><span class="p">:</span>
|
||||
<span class="n">kwargs</span><span class="p">[</span><span class="s2">"max_seq_len"</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_seq_len</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">:</span>
|
||||
<span class="n">kwargs</span><span class="p">[</span><span class="s2">"max_beam_width"</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_beam_width</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_input_len</span><span class="p">:</span>
|
||||
<span class="n">kwargs</span><span class="p">[</span><span class="s2">"max_input_len"</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_input_len</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span> <span class="o">=</span> <span class="n">BuildConfig</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
|
||||
<span class="k">return</span> <span class="bp">self</span></div>
    @model_validator(mode="after")
    def validate_build_config_with_runtime_params(self):
        # Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,
        # which will be passed to the C++ Executor API, overwriting the values
        # from a built engine. In order to set the build configuration, it is
        # recommended to use build_config instead.
        assert isinstance(
            self.build_config, BuildConfig
        ), f"build_config is not initialized: {self.build_config}"

        if self.max_batch_size is not None:
            if self.max_batch_size > self.build_config.max_batch_size:
                # Log before clamping so the message shows the rejected value.
                logger.warning(
                    f"max_batch_size [{self.max_batch_size}] is overridden by build_config.max_batch_size [{self.build_config.max_batch_size}]"
                )
                self.max_batch_size = self.build_config.max_batch_size
        if self.max_num_tokens is not None:
            if self.max_num_tokens > self.build_config.max_num_tokens:
                logger.warning(
                    f"max_num_tokens [{self.max_num_tokens}] is overridden by build_config.max_num_tokens [{self.build_config.max_num_tokens}]"
                )
                self.max_num_tokens = self.build_config.max_num_tokens
        if self.max_seq_len is not None:
            if self.max_seq_len != self.build_config.max_seq_len:
                logger.warning(
                    f"max_seq_len [{self.max_seq_len}] is overridden by build_config.max_seq_len [{self.build_config.max_seq_len}]"
                )
        if self.max_beam_width is not None:
            if self.max_beam_width != self.build_config.max_beam_width:
                logger.warning(
                    f"max_beam_width [{self.max_beam_width}] is overridden by build_config.max_beam_width [{self.build_config.max_beam_width}]"
                )
        if self.max_input_len is not None:
            if self.max_input_len != self.build_config.max_input_len:
                logger.warning(
                    f"max_input_len [{self.max_input_len}] is overridden by build_config.max_input_len [{self.build_config.max_input_len}]"
                )

        return self
    @model_validator(mode="after")
    def validate_build_config_remaining(self):
        is_trt_llm_args = isinstance(self, TrtLlmArgs)

        # TODO: remove the checker when manage weights supports all data types
        if is_trt_llm_args and self.fast_build and (self.quant_config.quant_algo
                                                    is QuantAlgo.FP8):
            self.build_config.plugin_config.manage_weights = True

        if self.parallel_config.world_size == 1 and self.build_config:
            self.build_config.plugin_config.nccl_plugin = None

        if self.enable_lora and self.backend != 'pytorch':
            self.build_config.plugin_config.lora_plugin = 'auto'
            if self.lora_config is not None:
                self.build_config.lora_config.max_lora_rank = self.lora_config.max_lora_rank

        if hasattr(self,
                   'enable_prompt_adapter') and self.enable_prompt_adapter:
            self.build_config.max_prompt_embedding_table_size = self.max_prompt_adapter_token * self.build_config.max_batch_size

        if self.max_beam_width is None:
            if self.build_config:
                self.max_beam_width = self.build_config.max_beam_width
            else:
                self.max_beam_width = 1

        return self
    @model_validator(mode="after")
    def validate_speculative_config(self):
        if self.speculative_config:
            if not self.speculative_config.supports_backend(self.backend):
                raise ValueError(
                    f"Speculation type {self.speculative_config.decoding_type} does not "
                    f"support backend {self.backend}")

            # Below, we only need to set speculative_decoding_mode/decoding_config for speculation
            # on the TRT backend.
            if isinstance(self.speculative_config, LookaheadDecodingConfig):
                max_draft_len = self.speculative_config.calculate_speculative_resource(
                )[2]
                assert max_draft_len > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.LOOKAHEAD_DECODING
                self.build_config.max_draft_len = max(
                    self.build_config.max_draft_len, max_draft_len)
                self.decoding_config = DecodingConfig(
                    decoding_mode=DecodingMode.Lookahead(),
                    lookahead_decoding_config=PybindMirror.maybe_to_pybind(
                        self.speculative_config))

            elif isinstance(self.speculative_config, MedusaDecodingConfig):
                assert self.speculative_config.max_draft_len > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA
                self.build_config.max_draft_len = self.speculative_config.max_draft_len
                self.decoding_config = DecodingConfig(
                    decoding_mode=DecodingMode.Medusa(),
                    medusa_choices=self.speculative_config.medusa_choices)

            elif isinstance(self.speculative_config, EagleDecodingConfig):
                assert self.speculative_config.max_draft_len > 0
                assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
                self.build_config.max_draft_len = self.speculative_config.max_draft_len
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
                eagle_config = _EagleConfig(
                    self.speculative_config.eagle_choices,
                    self.speculative_config.greedy_sampling,
                    self.speculative_config.posterior_threshold,
                    self.speculative_config.use_dynamic_tree,
                    self.speculative_config.dynamic_tree_max_topK)
                self.decoding_config = DecodingConfig(
                    decoding_mode=DecodingMode.Eagle(),
                    eagle_config=eagle_config)
            else:
                raise ValueError(
                    f"Unrecognized speculative config type {type(self.speculative_config)}"
                )

        else:
            self.decoding_config = None

        self._speculative_model = getattr(self.speculative_config,
                                          "speculative_model_dir", None)
        speculative_model_obj = _ModelWrapper(
            self._speculative_model
        ) if self._speculative_model is not None else None
        if self._speculative_model and speculative_model_obj.is_local_model:
            self._speculative_model_format = _ModelFormatKind.HF

        return self
    def _load_config_from_engine(self, engine_dir: Path):
        engine_config = EngineConfig.from_json_file(engine_dir / "config.json")
        self._pretrained_config = engine_config.pretrained_config
        self.build_config = engine_config.build_config

        # load and check parallel_config
        mapping = self._pretrained_config.mapping
        if self.parallel_config.tp_size not in (1, mapping.tp_size):
            raise ValueError(
                f"tp_size {self.parallel_config.tp_size} is not consistent with the engine's tp_size {mapping.tp_size}"
            )
        if self.parallel_config.pp_size not in (1, mapping.pp_size):
            raise ValueError(
                f"pp_size {self.parallel_config.pp_size} is not consistent with the engine's pp_size {mapping.pp_size}"
            )
        if self.parallel_config.cp_size not in (1, mapping.cp_size):
            raise ValueError(
                f"cp_size {self.parallel_config.cp_size} is not consistent with the engine's cp_size {mapping.cp_size}"
            )
        self._parallel_config = _ParallelConfig(
            tp_size=mapping.tp_size,
            pp_size=mapping.pp_size,
            cp_size=mapping.cp_size,
            gpus_per_node=mapping.gpus_per_node,
            moe_cluster_size=mapping.moe_cluster_size,
            moe_tp_size=mapping.moe_tp_size,
            moe_ep_size=mapping.moe_ep_size)
    def _load_config_from_ckpt(self, ckpt_dir: Path):
        pretrained_config = PretrainedConfig.from_json_file(ckpt_dir /
                                                            "config.json")
        tp_size = pretrained_config.mapping.tp_size
        pp_size = pretrained_config.mapping.pp_size
        cp_size = pretrained_config.mapping.cp_size
        moe_cluster_size = pretrained_config.mapping.moe_cluster_size
        moe_tp_size = pretrained_config.mapping.moe_tp_size
        moe_ep_size = pretrained_config.mapping.moe_ep_size
        gpus_per_node = pretrained_config.mapping.gpus_per_node
        # load parallel_config
        if self.parallel_config.tp_size != 1 and self.parallel_config.tp_size != tp_size:
            raise ValueError(
                f"tp_size {self.parallel_config.tp_size} is not consistent with the checkpoint's tp_size {tp_size}"
            )
        if self.parallel_config.pp_size != 1 and self.parallel_config.pp_size != pp_size:
            raise ValueError(
                f"pp_size {self.parallel_config.pp_size} is not consistent with the checkpoint's pp_size {pp_size}"
            )
        if self.parallel_config.cp_size != 1 and self.parallel_config.cp_size != cp_size:
            raise ValueError(
                f"cp_size {self.parallel_config.cp_size} is not consistent with the checkpoint's cp_size {cp_size}"
            )
        self._parallel_config = _ParallelConfig(
            tp_size=tp_size,
            pp_size=pp_size,
            cp_size=cp_size,
            gpus_per_node=gpus_per_node,
            moe_cluster_size=moe_cluster_size,
            moe_tp_size=moe_tp_size,
            moe_ep_size=moe_ep_size)
    @model_validator(mode="after")
    def validate_model_format_misc(self):
        '''
        Load the model format, and do the following:

        1. Load the build_config if given an engine.
        2. Load the parallel_config if given a checkpoint.
        '''
        model_obj = _ModelWrapper(self.model)

        if model_obj.is_local_model and self.backend not in [
                'pytorch', '_autodeploy'
        ]:
            # Load parallel_config from the engine.
            model_format = get_model_format(
                self.model, trust_remote_code=self.trust_remote_code)

            if model_format is _ModelFormatKind.TLLM_ENGINE:
                if self.build_config is not None:
                    logger.warning(
                        "The build_config is ignored for model format of TLLM_ENGINE."
                    )
                self._load_config_from_engine(model_obj.model_dir)
                runtime_defaults = self._pretrained_config.runtime_defaults
                if runtime_defaults:
                    self.kv_cache_config.fill_empty_fields_from_runtime_defaults(
                        runtime_defaults)

            # Load parallel_config from the checkpoint.
            elif model_format is _ModelFormatKind.TLLM_CKPT:
                self._load_config_from_ckpt(model_obj.model_dir)
        else:
            model_format = _ModelFormatKind.HF

        # Store the model format in the values
        self._model_format = model_format
        return self
    @field_validator('calib_config', mode='before')

@@ -3320,14 +3319,6 @@
class TorchLlmArgs(BaseLlmArgs):
    # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs
    build_config: Optional[BuildConfig] = Field(
        default=None,
        description="Build config.",
        exclude_from_json=True,
        status="deprecated",
    )

    # PyTorch backend specific configurations
    garbage_collection_gen0_threshold: int = Field(
        default=20000,

@@ -3360,6 +3351,11 @@
        description="MoE config.",
        status="beta")

    nvfp4_gemm_config: Nvfp4GemmConfig = Field(
        default_factory=Nvfp4GemmConfig,
        description="NVFP4 GEMM backend config.",
        status="beta")

    attn_backend: str = Field(default='TRTLLM',
                              description="Attention backend to use.",
                              status="beta")

@@ -3512,8 +3508,12 @@
    # PrivateVars
    _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)

    _disable_flash_infer_sampling: bool = PrivateAttr(default=True)
    """Unless this is set to False, FlashInfer.sampling is not used, even if available."""
    disable_flashinfer_sampling: bool = Field(
        default=False,
        description=
        "Disable the use of FlashInfer.sampling. This option is likely to be removed in the future.",
        status="prototype",
    )

    @property
    def quant_config(self) -> QuantConfig:

@@ -3564,6 +3564,73 @@
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">extra_resource_managers</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">object</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">_extra_resource_managers</span> <span class="o">=</span> <span class="n">value</span>
|
||||
|
||||
<div class="viewcode-block" id="TorchLlmArgs.validate_misc">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_misc">[docs]</a>
|
||||
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">"after"</span><span class="p">)</span>
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">validate_misc</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">_model_format</span> <span class="o">=</span> <span class="n">_ModelFormatKind</span><span class="o">.</span><span class="n">HF</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_beam_width</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">max_beam_width</span> <span class="o">=</span> <span class="mi">1</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="o">=</span> <span class="mi">2048</span>
|
||||
<span class="k">return</span> <span class="bp">self</span></div>
    @model_validator(mode="after")
    def validate_speculative_config(self):
        if self.speculative_config:
            if not self.speculative_config.supports_backend(self.backend):
                raise ValueError(
                    f"Speculation type {self.speculative_config.decoding_type} does not "
                    f"support backend {self.backend}")

            if isinstance(self.speculative_config, EagleDecodingConfig):
                assert self.speculative_config.max_draft_len > 0
                assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
            elif isinstance(self.speculative_config, NGramDecodingConfig):
                assert self.speculative_config.max_draft_len > 0 and self.speculative_config.max_matching_ngram_size > 0
            elif isinstance(self.speculative_config, DraftTargetDecodingConfig):
                assert self.speculative_config.max_draft_len > 0
                assert self.speculative_config.speculative_model_dir is not None, "Path to draft model must be specified."
            elif isinstance(self.speculative_config, MTPDecodingConfig):
                assert self.speculative_config.num_nextn_predict_layers > 0
                self.speculative_config.max_draft_len = self.speculative_config.num_nextn_predict_layers
            elif isinstance(self.speculative_config,
                            UserProvidedDecodingConfig):
                pass
            elif isinstance(self.speculative_config, AutoDecodingConfig):
                pass
            elif isinstance(self.speculative_config,
                            SaveHiddenStatesDecodingConfig):
                assert self.backend in ['pytorch']
                logger.warning(
                    "SaveHiddenStatesDecodingConfig is active, setting max_batch_size to 1, disabling overlap scheduler, and setting cuda_graph_config to None"
                )
                self.max_batch_size = 1
                self.disable_overlap_scheduler = True
                self.cuda_graph_config = None
                self.speculative_config.max_draft_len = 1
            else:
                raise ValueError(
                    f"Unrecognized speculative config type {type(self.speculative_config)}"
                )
        else:
            self.decoding_config = None

        self._speculative_model = getattr(self.speculative_config,
                                          "speculative_model_dir", None)
        speculative_model_obj = _ModelWrapper(
            self._speculative_model
        ) if self._speculative_model is not None else None
        if self._speculative_model and speculative_model_obj.is_local_model:
            self._speculative_model_format = _ModelFormatKind.HF

        return self
    @model_validator(mode="after")

@ -3807,6 +3874,15 @@

        llm_args_dict: Dict,
        extra_llm_api_options: Optional[str] = None) -> Dict:

    # Deep merge kv_cache_config to prevent a partial YAML kv_cache_config from replacing the complete kv_cache_config
    if 'kv_cache_config' in llm_args and 'kv_cache_config' in llm_args_dict:
        # Convert KvCacheConfig object to dict if necessary
        base_kv_config = llm_args['kv_cache_config']
        if isinstance(base_kv_config, KvCacheConfig):
            base_kv_config = base_kv_config.model_dump(exclude_unset=True)
        llm_args_dict['kv_cache_config'] = base_kv_config | llm_args_dict[
            'kv_cache_config']
    field_mapping = {
        "quant_config": QuantConfig,
        "calib_config": CalibConfig,

@ -3816,8 +3892,10 @@

        "speculative_config": DecodingBaseConfig,
        "lora_config": LoraConfig,
        "moe_config": MoeConfig,
        "nvfp4_gemm_config": Nvfp4GemmConfig,
        "attention_dp_config": AttentionDpConfig,
        "sparse_attention_config": BaseSparseAttentionConfig,
        "kv_cache_config": KvCacheConfig,
    }
    for field_name, field_type in field_mapping.items():
        if field_name in llm_args_dict:

@ -3833,8 +3911,7 @@

    llm_args = llm_args | llm_args_dict

    # For trtllm-bench or trtllm-serve, build_config may be passed for the PyTorch
    # backend, overwriting the knobs there since build_config always has the highest priority
    # build_config only works for the TensorRT backend; it is ignored by the PyTorch backend
    if "build_config" in llm_args:
        # Ensure build_config is a BuildConfig object, not a dict
        if isinstance(llm_args["build_config"], dict):
@ -1075,6 +1080,13 @@

        return s.getsockname()[1]


def find_free_ipc_addr() -> str:
    import os
    import tempfile
    import uuid
    return f'ipc://{os.path.join(tempfile.gettempdir(), "rpc_" + str(uuid.uuid4()))}'


def get_mpi_world_size() -> int:
    # avoid cyclic import
    from ..executor.utils import get_spawn_proxy_process_env
|
||||
@ -1236,9 +1248,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -881,9 +886,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1185,9 +1190,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -793,9 +798,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -810,9 +815,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1009,9 +1014,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -838,9 +843,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -73,7 +73,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.2.0rc4" />
<meta name="docsearch:version" content="1.2.0rc5" />
</head>
@ -369,7 +369,9 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
</ul>
</details></li>
</ul>
@ -410,11 +412,14 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
<ul class="nav bd-sidenav">
@ -669,9 +674,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on November 23, 2025.</p>
<p>Last updated on December 07, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
</div></div>
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -922,9 +927,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -820,9 +825,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -684,9 +689,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -810,9 +815,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -904,9 +909,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -986,9 +991,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1022,9 +1027,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1958,9 +1963,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -2865,9 +2870,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -745,9 +750,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -907,9 +912,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -835,9 +840,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@@ -73,7 +73,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.2.0rc4" />
+<meta name="docsearch:version" content="1.2.0rc5" />
</head>
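The docsearch:version meta updated here is the value Algolia DocSearch records for each indexed page, which lets a search client facet results by release. A hedged sketch of a DocSearch v3 setup that scopes queries to whatever version the current page advertises — the container selector, index name, and credentials below are placeholders, not the project's real configuration:

    // Sketch: filter DocSearch results to the version declared by this page's
    // <meta name="docsearch:version"> tag. appId/apiKey/indexName are placeholders.
    import docsearch from "@docsearch/js";

    const versionMeta = document.querySelector('meta[name="docsearch:version"]');
    const version = versionMeta ? versionMeta.getAttribute("content") : "latest";

    docsearch({
      container: "#docsearch",
      appId: "YOUR_APP_ID",
      apiKey: "YOUR_SEARCH_API_KEY",
      indexName: "tensorrt-llm", // assumed index name
      searchParameters: { facetFilters: [`version:${version}`] },
    });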
@@ -369,7 +369,9 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
</ul>
</details></li>
</ul>
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1030,9 +1035,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -954,9 +959,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1057,9 +1062,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -683,9 +688,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -833,9 +838,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -775,9 +780,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -909,9 +914,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1257,9 +1262,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1102,9 +1107,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -742,9 +747,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -892,9 +897,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -2203,9 +2208,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -369,7 +369,9 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
@ -410,11 +412,14 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/quantization.html">Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/sampling.html">Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/additional-outputs.html">Additional Outputs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/guided-decoding.html">Guided Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/speculative-decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/helix.html">Helix Parallelism</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1269,9 +1274,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on November 23, 2025.</p>
|
||||
<p>Last updated on December 07, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -73,7 +73,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.2.0rc4" />
|
||||
<meta name="docsearch:version" content="1.2.0rc5" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -1142,13 +1147,44 @@

    @staticmethod
    def max_workspace_size_auto(tp_size: int,
                                support_deterministic=True) -> int:
        """Calculate workspace size for allreduce fusion kernel.

        The workspace is used for lamport buffers in the fusion kernel.
        Required size calculation:
        - Each GPU needs 3 sub-buffers (for triple buffering)
        - Each sub-buffer stores: max_num_tokens * hidden_size * dtype_size (bf16=2)
        - The lamport allocation multiplies by tp_size, so:
          lamport_size = 3 * size * tp_size (per GPU)

        Example: Llama 8B (hidden=4096), max_tokens=8192, bf16, TP=4
        - Data per sub-buffer: 8192 * 4096 * 2 = 64 MiB
        - Total lamport: 3 * 64 MiB * 4 = 768 MiB per GPU
        - Required 'size' parameter: 64 MiB (gets multiplied by tp_size in allocation)

        Default (67,108,864 = 64 MiB) supports:
        - Models up to hidden_size=4096 with max_num_tokens=8192
        - Or hidden_size=8192 with max_num_tokens=4096

        Override with TRTLLM_ALLREDUCE_FUSION_WORKSPACE_SIZE env var if needed for larger models.
        """
        if force_all_reduce_deterministic() and support_deterministic:
            workspace_size = os.getenv("FORCE_ALLREDUCE_KERNEL_WORKSPACE_SIZE",
                                       "1000000000")
            return int(workspace_size)
        if tp_size <= 2:
            return 16_000_000
        return 8_000_000

        # Allow override via environment variable for edge cases
        workspace_size_env = os.getenv("TRTLLM_ALLREDUCE_FUSION_WORKSPACE_SIZE")
        if workspace_size_env:
            size = int(workspace_size_env)
            logger.info(
                f"Using custom allreduce fusion workspace size: {size} bytes ({size / (1024**2):.1f} MiB)"
            )
            return size

        # Default: 64 MiB - supports most common model configurations
        # Increase via env var if you see CUDA illegal memory access errors with large models
        default_size = 67_108_864  # Exactly 64 MiB
        return default_size

    @staticmethod
    def max_workspace_size_lowprecision(tp_size: int) -> int:
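The sizing rule documented in max_workspace_size_auto reduces to simple arithmetic. The following minimal sketch reproduces the docstring's Llama 8B example under its stated assumptions (bf16, i.e. 2 bytes per element, and triple buffering); note that estimate_lamport_workspace is a hypothetical helper written here for illustration, not part of the TensorRT-LLM API.

# Minimal sketch of the lamport buffer sizing described above.
# `estimate_lamport_workspace` is illustrative only, not a TensorRT-LLM API.

def estimate_lamport_workspace(max_num_tokens: int, hidden_size: int,
                               tp_size: int, dtype_size: int = 2):
    """Return (size_param, total_per_gpu) in bytes."""
    size_param = max_num_tokens * hidden_size * dtype_size  # one of 3 sub-buffers
    total_per_gpu = 3 * size_param * tp_size  # triple buffering, scaled by tp_size
    return size_param, total_per_gpu

# Reproduces the docstring example: hidden=4096, max_tokens=8192, bf16, TP=4.
size_param, total_per_gpu = estimate_lamport_workspace(8192, 4096, 4)
assert size_param == 64 * 1024**2      # 64 MiB 'size' parameter
assert total_per_gpu == 768 * 1024**2  # 768 MiB lamport allocation per GPU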
@ -559,6 +564,7 @@

    W4A8_MXFP4_FP8 = auto()
    W4A8_MXFP4_MXFP8 = auto()
    W4A16_MXFP4 = auto()
    NVFP4_AWQ = auto()
    NO_QUANT = auto()

@ -928,6 +934,9 @@

        quant_mode = QuantMode.from_description(use_fp8_block_scales=True)
    elif quant_algo == QuantAlgo.NVFP4:
        quant_mode = QuantMode.from_description(use_nvfp4=True)
    elif quant_algo == QuantAlgo.NVFP4_AWQ:
        # NVFP4_AWQ uses the same QuantMode as NVFP4, distinction is at QuantAlgo level
        quant_mode = QuantMode.from_description(use_nvfp4=True)
    elif quant_algo == QuantAlgo.W4A8_NVFP4_FP8:
        quant_mode = QuantMode.from_description(use_w4a8_nvfp4_fp8=True)
    elif quant_algo == QuantAlgo.W4A8_MXFP4_FP8:
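Since the new NVFP4_AWQ algorithm deliberately reuses the NVFP4 QuantMode description, the two are only distinguishable at the QuantAlgo level. A minimal sketch of that invariant, assuming QuantAlgo and QuantMode are importable from tensorrt_llm.quantization.mode as suggested by the hunks above:

# Sketch, assuming the import path below matches the module shown in the diff.
from tensorrt_llm.quantization.mode import QuantAlgo, QuantMode

# NVFP4 and NVFP4_AWQ map to the same runtime quantization mode ...
mode_nvfp4 = QuantMode.from_description(use_nvfp4=True)
mode_nvfp4_awq = QuantMode.from_description(use_nvfp4=True)  # same description
assert mode_nvfp4 == mode_nvfp4_awq

# ... while the algorithm enum keeps them distinct, so loaders can still
# apply AWQ-specific handling (e.g. pre_quant_scale) for the AWQ variant.
assert QuantAlgo.NVFP4 != QuantAlgo.NVFP4_AWQ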
@ -1100,9 +1109,9 @@
<div class="footer-item">
<div class="extra_footer">

<p>Last updated on November 23, 2025.</p>
<p>Last updated on December 07, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a761585">a761585</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e4c7078">e4c7078</a>.</p>

</div></div>


@ -60,7 +60,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc4';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.2.0rc5';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -73,7 +73,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.2.0rc4" />
<meta name="docsearch:version" content="1.2.0rc5" />


</head>
@ -369,7 +369,9 @@
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.html">Deployment Guide for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.html">Deployment Guide for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.html">Deployment Guide for GPT-OSS on TensorRT-LLM - Blackwell Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-on-trtllm.html">Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.html">Deployment Guide for Qwen3 Next on TensorRT LLM - Blackwell & Hopper Hardware</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.html">Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell</a></li>
</ul>
</details></li>
</ul>
@ -410,11 +412,14 @@
<li class="toctree-l1"><a class="reference internal" href="../../../features/quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/sampling.html">Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/additional-outputs.html">Additional Outputs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/guided-decoding.html">Guided Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/speculative-decoding.html">Speculative Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/checkpoint-loading.html">Checkpoint Loading</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/auto_deploy/auto-deploy.html">AutoDeploy (Prototype)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/ray-orchestrator.html">Ray Orchestrator (Prototype)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/torch_compile_and_piecewise_cuda_graph.html">Torch Compile & Piecewise CUDA Graph</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/helix.html">Helix Parallelism</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../features/kv-cache-connector.html">KV Cache Connector</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
<ul class="nav bd-sidenav">
@ -916,6 +921,25 @@
        strs = [self.stop] if isinstance(self.stop, str) else self.stop
        self._stop_word_ids = [_encode(tokenizer, s, add_special_tokens) for s in strs]

        # add generation_config to stop word list, only in qwen3-next now
        if (
            hf_model_config is not None
            and hf_model_config.model_type == "qwen3_next"
            and generation_config is not None
            and isinstance(generation_config.eos_token_id, List)
            and all(isinstance(i, int) for i in generation_config.eos_token_id)
        ):
            if self._stop_word_ids:
                all_stop_tokens_id = set(i for sublist in self._stop_word_ids for i in sublist)
                from_generation_stop_tokens = [
                    i for i in generation_config.eos_token_id if i not in all_stop_tokens_id
                ]

                if from_generation_stop_tokens:
                    self._stop_word_ids.append(from_generation_stop_tokens)
            else:
                self._stop_word_ids = [generation_config.eos_token_id]

        return self

    def _get_bad_words(self) -> List[List[int]]:
@ -4,6 +4,24 @@ Executor
.. Here are files in the cpp/include/executor
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
transferAgent.h
_______________

.. doxygenfile:: transferAgent.h
   :project: TensorRT-LLM

types.h
_______

.. doxygenfile:: types.h
   :project: TensorRT-LLM

cacheCommunicator.h
___________________

.. doxygenfile:: cacheCommunicator.h
   :project: TensorRT-LLM

disaggServerUtil.h
__________________

@ -16,24 +34,6 @@ ________
.. doxygenfile:: tensor.h
   :project: TensorRT-LLM

transferAgent.h
_______________

.. doxygenfile:: transferAgent.h
   :project: TensorRT-LLM

serialization.h
_______________

.. doxygenfile:: serialization.h
   :project: TensorRT-LLM

types.h
_______

.. doxygenfile:: types.h
   :project: TensorRT-LLM

executor.h
__________

@ -46,9 +46,9 @@ ______________________
.. doxygenfile:: dataTransceiverState.h
   :project: TensorRT-LLM

cacheCommunicator.h
___________________
serialization.h
_______________

.. doxygenfile:: cacheCommunicator.h
.. doxygenfile:: serialization.h
   :project: TensorRT-LLM

@ -4,148 +4,22 @@ Runtime
.. Here are files in the cpp/include/runtime
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
lookaheadBuffers.h
__________________

.. doxygenfile:: lookaheadBuffers.h
   :project: TensorRT-LLM

lookaheadModule.h
_________________

.. doxygenfile:: lookaheadModule.h
   :project: TensorRT-LLM

iBuffer.h
_________

.. doxygenfile:: iBuffer.h
   :project: TensorRT-LLM

modelConfig.h
_____________

.. doxygenfile:: modelConfig.h
   :project: TensorRT-LLM

decodingOutput.h
________________

.. doxygenfile:: decodingOutput.h
   :project: TensorRT-LLM

promptTuningParams.h
____________________

.. doxygenfile:: promptTuningParams.h
   :project: TensorRT-LLM

bufferManager.h
_______________

.. doxygenfile:: bufferManager.h
   :project: TensorRT-LLM

gptJsonConfig.h
_______________

.. doxygenfile:: gptJsonConfig.h
   :project: TensorRT-LLM

runtimeDefaults.h
_________________

.. doxygenfile:: runtimeDefaults.h
   :project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
   :project: TensorRT-LLM

rawEngine.h
___________

.. doxygenfile:: rawEngine.h
   :project: TensorRT-LLM

gptDecoder.h
____________

.. doxygenfile:: gptDecoder.h
   :project: TensorRT-LLM

eagleBuffers.h
______________

.. doxygenfile:: eagleBuffers.h
   :project: TensorRT-LLM

medusaModule.h
______________

.. doxygenfile:: medusaModule.h
   :project: TensorRT-LLM

virtualMemory.h
_______________

.. doxygenfile:: virtualMemory.h
   :project: TensorRT-LLM

explicitDraftTokensBuffers.h
____________________________

.. doxygenfile:: explicitDraftTokensBuffers.h
   :project: TensorRT-LLM

iTensor.h
_________

.. doxygenfile:: iTensor.h
   :project: TensorRT-LLM

common.h
________

.. doxygenfile:: common.h
   :project: TensorRT-LLM

loraCachePageManagerConfig.h
____________________________

.. doxygenfile:: loraCachePageManagerConfig.h
   :project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
   :project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
   :project: TensorRT-LLM

speculativeDecodingMode.h
_________________________

.. doxygenfile:: speculativeDecodingMode.h
   :project: TensorRT-LLM

cudaEvent.h
___________

.. doxygenfile:: cudaEvent.h
   :project: TensorRT-LLM

decodingInput.h
virtualMemory.h
_______________

.. doxygenfile:: decodingInput.h
.. doxygenfile:: virtualMemory.h
   :project: TensorRT-LLM

speculativeDecodingModule.h
@ -154,40 +28,10 @@ ___________________________
.. doxygenfile:: speculativeDecodingModule.h
   :project: TensorRT-LLM

iGptDecoderBatched.h
____________________
common.h
________

.. doxygenfile:: iGptDecoderBatched.h
   :project: TensorRT-LLM

eagleModule.h
_____________

.. doxygenfile:: eagleModule.h
   :project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
   :project: TensorRT-LLM

gptDecoderBatched.h
___________________

.. doxygenfile:: gptDecoderBatched.h
   :project: TensorRT-LLM

cudaStream.h
____________

.. doxygenfile:: cudaStream.h
   :project: TensorRT-LLM

ipcNvlsMemory.h
_______________

.. doxygenfile:: ipcNvlsMemory.h
.. doxygenfile:: common.h
   :project: TensorRT-LLM

samplingConfig.h
@ -196,16 +40,136 @@ ________________
.. doxygenfile:: samplingConfig.h
   :project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
   :project: TensorRT-LLM

lookaheadModule.h
_________________

.. doxygenfile:: lookaheadModule.h
   :project: TensorRT-LLM

modelConfig.h
_____________

.. doxygenfile:: modelConfig.h
   :project: TensorRT-LLM

iGptDecoderBatched.h
____________________

.. doxygenfile:: iGptDecoderBatched.h
   :project: TensorRT-LLM

cudaStream.h
____________

.. doxygenfile:: cudaStream.h
   :project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
   :project: TensorRT-LLM

medusaModule.h
______________

.. doxygenfile:: medusaModule.h
   :project: TensorRT-LLM

decoderState.h
______________

.. doxygenfile:: decoderState.h
   :project: TensorRT-LLM

ipcUtils.h
__________
lookaheadBuffers.h
__________________

.. doxygenfile:: ipcUtils.h
.. doxygenfile:: lookaheadBuffers.h
   :project: TensorRT-LLM

eagleModule.h
_____________

.. doxygenfile:: eagleModule.h
   :project: TensorRT-LLM

runtimeDefaults.h
_________________

.. doxygenfile:: runtimeDefaults.h
   :project: TensorRT-LLM

decodingOutput.h
________________

.. doxygenfile:: decodingOutput.h
   :project: TensorRT-LLM

decodingInput.h
_______________

.. doxygenfile:: decodingInput.h
   :project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
   :project: TensorRT-LLM

gptDecoderBatched.h
___________________

.. doxygenfile:: gptDecoderBatched.h
   :project: TensorRT-LLM

explicitDraftTokensBuffers.h
____________________________

.. doxygenfile:: explicitDraftTokensBuffers.h
   :project: TensorRT-LLM

bufferManager.h
_______________

.. doxygenfile:: bufferManager.h
   :project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
   :project: TensorRT-LLM

eagleBuffers.h
______________

.. doxygenfile:: eagleBuffers.h
   :project: TensorRT-LLM

speculativeDecodingMode.h
_________________________

.. doxygenfile:: speculativeDecodingMode.h
   :project: TensorRT-LLM

promptTuningParams.h
____________________

.. doxygenfile:: promptTuningParams.h
   :project: TensorRT-LLM

gptDecoder.h
____________

.. doxygenfile:: gptDecoder.h
   :project: TensorRT-LLM

memoryCounters.h
@ -214,3 +178,39 @@ ________________
.. doxygenfile:: memoryCounters.h
   :project: TensorRT-LLM

ipcNvlsMemory.h
_______________

.. doxygenfile:: ipcNvlsMemory.h
   :project: TensorRT-LLM

rawEngine.h
___________

.. doxygenfile:: rawEngine.h
   :project: TensorRT-LLM

ipcUtils.h
__________

.. doxygenfile:: ipcUtils.h
   :project: TensorRT-LLM

iBuffer.h
_________

.. doxygenfile:: iBuffer.h
   :project: TensorRT-LLM

gptJsonConfig.h
_______________

.. doxygenfile:: gptJsonConfig.h
   :project: TensorRT-LLM

loraCachePageManagerConfig.h
____________________________

.. doxygenfile:: loraCachePageManagerConfig.h
   :project: TensorRT-LLM


@ -28,7 +28,7 @@ TensorRT LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x

<sub>FP8 H100, FP16 A100, SXM 80GB GPUs, TP1, ISL/OSL's provided, TensorRT LLM v0.5.0, TensorRT 9.1</sub>

The full data behind these charts and tables, including larger models with higher TP values, can be found in TensorRT LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html).
The full data behind these charts and tables, including larger models with higher TP values, can be found in TensorRT LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/0.21.0/performance/perf-overview.html).

Stay tuned for a highlight on Llama coming soon!


@ -21,7 +21,7 @@ TensorRT LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news

<sup>*(1) Largest batch supported on given TP configuration by power of 2.*</sup> <sup>*(2) TP = Tensor Parallelism*</sup>

Additional performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, and soon in [TensorRT LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html).
Additional performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, and soon in [TensorRT LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/0.21.0/performance/perf-overview.html).

### H200 vs H100


@ -124,7 +124,7 @@ In the Dynamo workflow, requests are initially processed by pre- and post-proces

Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments.

For more information on how to use Dynamo with TensorRT LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html).
For more information on how to use Dynamo with TensorRT LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/backends/trtllm/README.html).

### Triton Inference Server


@ -25,7 +25,7 @@ TensorRT LLM distributes the pre-built container on [NGC Catalog](https://catalo
You can launch the container using the following command:

```bash
docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4
docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5
```


@ -161,34 +161,36 @@ P99 E2EL (ms): 1643.44

For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:

```math
\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
```
$$
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
$$

Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):

```math
$$
\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
```
$$

```math
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
```
$$
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
$$
|
||||
|
||||
#### End-to-End (E2E) Latency
|
||||
* The typical total time from when a request is submitted until the final token of the response is received.
|
||||
|
||||
#### Total Token Throughput
|
||||
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
|
||||
```math
|
||||
\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
|
||||
```
|
||||
|
||||
$$
|
||||
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
|
||||
$$
|
||||
|
||||
#### Tokens Per Second (TPS) or Output Token Throughput
|
||||
* how many output tokens the system generates each second.
|
||||
```math
|
||||
\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
|
||||
```
|
||||
|
||||
$$
|
||||
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
|
||||
$$
|
||||
|
||||
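The distinction between the two averages matters whenever requests have different output lengths. Here is a minimal Python sketch of the computations defined above; the per-request timestamp lists are hypothetical inputs, not something the benchmark output provides in this form:

```python
def tpot(submit_time: float, token_times: list[float]) -> float:
    """TPOT for one request: (E2E latency - TTFT) / (num output tokens - 1)."""
    e2e = token_times[-1] - submit_time   # end-to-end latency
    ttft = token_times[0] - submit_time   # time to first token
    return (e2e - ttft) / (len(token_times) - 1)

def averages(requests: list[tuple[float, list[float]]]) -> tuple[float, float]:
    """Average TPOT (request-weighted) and average ITL (token-weighted)."""
    tpots = [tpot(s, t) for s, t in requests]
    itls = [b - a for _, t in requests for a, b in zip(t, t[1:])]
    return sum(tpots) / len(tpots), sum(itls) / len(itls)
```
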
### Request Time Breakdown

@ -41,13 +41,13 @@ Chat API

You can query the Chat API with any HTTP client; a typical example is the OpenAI Python client:

.. literalinclude:: ../../../examples/serve/openai_chat_client.py
.. literalinclude:: ../../../../examples/serve/openai_chat_client.py
   :language: python
   :linenos:

Another example uses ``curl``:

.. literalinclude:: ../../../examples/serve/curl_chat_client.sh
.. literalinclude:: ../../../../examples/serve/curl_chat_client.sh
   :language: bash
   :linenos:

@ -56,13 +56,13 @@ Completions API

You can query the Completions API with any HTTP client; a typical example is the OpenAI Python client:

.. literalinclude:: ../../../examples/serve/openai_completion_client.py
.. literalinclude:: ../../../../examples/serve/openai_completion_client.py
   :language: python
   :linenos:

Another example uses ``curl``:

.. literalinclude:: ../../../examples/serve/curl_completion_client.sh
.. literalinclude:: ../../../../examples/serve/curl_completion_client.sh
   :language: bash
   :linenos:

@ -97,13 +97,13 @@ Multimodal Chat API

You can query the Multimodal Chat API with any HTTP client; a typical example is the OpenAI Python client:

.. literalinclude:: ../../../examples/serve/openai_completion_client_for_multimodal.py
.. literalinclude:: ../../../../examples/serve/openai_completion_client_for_multimodal.py
   :language: python
   :linenos:

Another example uses ``curl``:

.. literalinclude:: ../../../examples/serve/curl_chat_client_for_multimodal.sh
.. literalinclude:: ../../../../examples/serve/curl_chat_client_for_multimodal.sh
   :language: bash
   :linenos:

@ -254,7 +254,23 @@ Example output:
}
]

Configuring with YAML Files
----------------------------

You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--extra_llm_api_options`` option to the path of a YAML file; the arguments in the file override the corresponding command-line arguments.

The YAML file configures `tensorrt_llm.llmapi.LlmArgs <https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs>`_. This class has multiple levels of hierarchy; to configure top-level arguments such as ``max_batch_size``, the YAML file should look like:

.. code-block:: yaml

   max_batch_size: 8

To configure nested arguments such as ``moe_config.backend``, the YAML file should look like:

.. code-block:: yaml

   moe_config:
     backend: CUTLASS

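For example, assuming the file above is saved as ``extra_llm_api_options.yaml`` (a placeholder path), it can be passed on the command line like this:

.. code-block:: bash

   trtllm-serve <model> --extra_llm_api_options extra_llm_api_options.yaml
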
Syntax
------

@ -47,7 +47,7 @@ docker run --rm -it \
    -p 8000:8000 \
    -v ~/.cache:/root/.cache:rw \
    --name tensorrt_llm \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4 \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
    /bin/bash
```

@ -250,7 +250,7 @@ Here is an example response, showing that the TensorRT LLM server returns “New
### Troubleshooting Tips

* If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`.
* For running input/output sequence lengths of 8K/1K on H200, there is a known CUDA Out-Of-Memory issue caused by the PyTorch CUDA Caching Allocator fragmenting memory. As a workaround, you can set the environment variable `PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192`. For more details, please refer to the [PyTorch documentation on optimizing memory usage](https://docs.pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf).
* For running input/output sequence lengths of 8K/1K on H200, there is a known CUDA Out-Of-Memory issue caused by the PyTorch CUDA Caching Allocator fragmenting memory. As a workaround, you can set the environment variable `PYTORCH_ALLOC_CONF=max_split_size_mb:8192`. For more details, please refer to the [PyTorch documentation on optimizing memory usage](https://docs.pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf).
* Ensure your model checkpoints are compatible with the expected format.
* For performance issues, check GPU utilization with `nvidia-smi` while the server is running.
* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed.
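
If you are launching through Docker, one way to apply this workaround (a sketch reusing the container image shown earlier in this guide) is to pass the variable with `-e`:

```bash
docker run --rm -it --ipc host -p 8000:8000 --gpus all \
    -e PYTORCH_ALLOC_CONF=max_split_size_mb:8192 \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5
```
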
@ -399,31 +399,33 @@ P99 E2EL (ms): [result]

For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:

```math
\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
```
$$
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
$$

Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):

```math
$$
\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
```
$$

```math
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
```
$$
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
$$

#### End-to-End (E2E) Latency
* The typical total time from when a request is submitted until the final token of the response is received.

#### Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
```math
\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
```

$$
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
$$

#### Tokens Per Second (TPS) or Output Token Throughput
* How many output tokens the system generates each second.
```math
\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
```

$$
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
$$

@ -43,7 +43,7 @@ docker run --rm -it \
    -p 8000:8000 \
    -v ~/.cache:/root/.cache:rw \
    --name tensorrt_llm \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4 \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
    /bin/bash
```

@ -212,8 +212,6 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/jso

Here is an example response, showing that the TensorRT LLM server reasons and answers the questions.

TODO: Use Chat Completions API / Responses API as the example after the PR is merged.

```json
{"id":"chatcmpl-c5bf51b5cab94e10ba5da5266d12ee59","object":"chat.completion","created":1755815898,"model":"openai/gpt-oss-120b","choices":[{"index":0,"message":{"role":"assistant","content":"analysisThe user asks: \"Where is New York?\" Likely they want location info. Provide answer: New York State in northeastern US, New York City on the east coast, coordinates, etc. Provide context.assistantfinal**New York** can refer to two related places in the United States:\n\n| What it is | Where it is | Approx. coordinates | How to picture it |\n|------------|------------|--------------------|-------------------|\n| **New York State** | The northeastern corner of the United States, bordered by **Vermont, Massachusetts, Connecticut, New Jersey, Pennsylvania, and the Canadian provinces of Ontario and Quebec**. | 42.7° N, 75.5° W (roughly the state’s geographic centre) | A roughly rectangular state that stretches from the Atlantic Ocean in the southeast to the Adirondack Mountains and the Great Lakes region in the north. |\n| **New York City (NYC)** | The largest city in the state, located on the **southern tip of the state** where the **Hudson River meets the Atlantic Ocean**. It occupies five boroughs: Manhattan, Brooklyn, Queens, The Bronx, and Staten Island. | 40.7128° N, 74.0060° W | A dense, world‑famous metropolis that sits on a series of islands (Manhattan, Staten Island, parts of the Bronx) and the mainland (Brooklyn and Queens). |\n\n### Quick geographic context\n- **On a map of the United States:** New York State is in the **Northeast** region, just east of the Great Lakes and north of Pennsylvania. \n- **From Washington, D.C.:** Travel roughly **225 mi (360 km) northeast**. \n- **From Boston, MA:** Travel about **215 mi (350 km) southwest**. \n- **From Toronto, Canada:** Travel about **500 mi (800 km) southeast**.\n\n### Travel tips\n- **By air:** Major airports include **John F. Kennedy International (JFK)**, **LaGuardia (LGA)**, and **Newark Liberty International (EWR)** (the latter is actually in New Jersey but serves the NYC metro area). \n- **By train:** Amtrak’s **Northeast Corridor** runs from **Boston → New York City → Washington, D.C.** \n- **By car:** Interstates **I‑87** (north‑south) and **I‑90** (east‑west) are the primary highways crossing the state.\n\n### Fun fact\n- The name “**New York**” was given by the English in 1664, honoring the Duke of York (later King James II). The city’s original Dutch name was **“New Amsterdam.”**\n\nIf you need more specific directions (e.g., how to get to a particular neighborhood, landmark, or the state capital **Albany**), just let me know!","reasoning_content":null,"tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null,"mm_embedding_handle":null,"disaggregated_params":null,"avg_decoded_tokens_per_iter":1.0}],"usage":{"prompt_tokens":72,"total_tokens":705,"completion_tokens":633},"prompt_token_ids":null}
```

@ -349,31 +347,33 @@ P99 E2EL (ms): [result]

For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:

```math
\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
```
$$
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
$$

Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):

```math
$$
\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
```
$$

```math
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
```
$$
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
$$

#### End-to-End (E2E) Latency
* The typical total time from when a request is submitted until the final token of the response is received.

#### Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
```math
\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
```

$$
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
$$

#### Tokens Per Second (TPS) or Output Token Throughput
* How many output tokens the system generates each second.
```math
\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
```

$$
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
$$

@ -0,0 +1,308 @@
# Deployment Guide for Kimi K2 Thinking on TensorRT LLM - Blackwell

## Introduction

This is a quickstart guide for running the Kimi K2 Thinking model on TensorRT LLM. It focuses on a working setup with recommended defaults.

## Prerequisites

* GPU: NVIDIA Blackwell Architecture
* OS: Linux
* Drivers: CUDA Driver 575 or Later
* Docker with NVIDIA Container Toolkit installed
* Python3 and python3-pip (Optional, for accuracy evaluation only)

## Models

* NVFP4 model: [Kimi-K2-Thinking-NVFP4](https://huggingface.co/nvidia/Kimi-K2-Thinking-NVFP4)


## Deploy Kimi K2 Thinking on DGX B200 through Docker

### Prepare Docker image

Build and run the docker container. See the [Docker guide](../../../docker/README.md) for details.
```bash
cd TensorRT-LLM

make -C docker release_build IMAGE_TAG=kimi-k2-thinking-local

make -C docker release_run IMAGE_NAME=tensorrt_llm IMAGE_TAG=kimi-k2-thinking-local LOCAL_USER=1
```

### Launch the TensorRT LLM Server

Prepare an `EXTRA_OPTIONS_YAML_FILE` that specifies LLM API arguments when deploying the model. An example YAML file is as follows:

```yaml
max_batch_size: 128
max_num_tokens: 8448
max_seq_len: 8212
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: true
pipeline_parallel_size: 1
print_iter_log: true
kv_cache_config:
  free_gpu_memory_fraction: 0.75
  dtype: fp8
cache_transceiver_config:
  backend: UCX
  max_tokens_in_buffer: 8448
trust_remote_code: true
```

This YAML file specifies configurations that deploy the model with 8-way expert parallelism for the MoE part and 8-way attention data parallelism. It also enables `trust_remote_code`, so that it works with the Kimi K2 Thinking customized [tokenizer](https://huggingface.co/nvidia/Kimi-K2-Thinking-NVFP4/blob/main/tokenization_kimi.py).


With the `EXTRA_OPTIONS_YAML_FILE`, use the following example command to launch the TensorRT LLM server with the Kimi-K2-Thinking-NVFP4 model from within the container.

```bash
trtllm-serve nvidia/Kimi-K2-Thinking-NVFP4 \
    --host 0.0.0.0 --port 8000 \
    --extra_llm_api_options ${EXTRA_OPTIONS_YAML_FILE}
```

TensorRT LLM will load weights and select the best kernels during startup. The server is successfully launched when the following log is shown:

```log
INFO: Started server process [xxxxx]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://localhost:8000 (Press CTRL+C to quit)
```

You can query the health/readiness of the server using:

```shell
curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
```

When the `Status: 200` code is returned, the server is ready for queries.

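If you are scripting the deployment, a small readiness loop can wait for that status before sending traffic. This is a minimal sketch, assuming the server runs on `localhost:8000` as launched above:

```bash
# Poll the health endpoint until the server reports HTTP 200.
until [ "$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/health")" = "200" ]; do
    sleep 10
done
echo "Server is ready."
```
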
## Deploy Kimi K2 Thinking on GB200 NVL72 through SLURM with wide EP and disaggregated serving

TensorRT LLM provides a set of SLURM scripts that can be easily configured through YAML files and automatically launch SLURM jobs on GB200 NVL72 clusters for deployment, benchmarking, and accuracy testing purposes. The scripts are located at `examples/disaggregated/slurm/benchmark`. Refer to [this page](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/slurm_scripts) for more details and example wide EP config files.

For Kimi K2 Thinking, an example configuration for SLURM arguments and the scripts is as follows:

```yaml
# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
  numa_bind: true # Only enable for GB200 NVL72

# Benchmark Mode
benchmark:
  mode: "e2e" # Options: e2e, gen_only
  use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script
  multi_round: 8 # Number of benchmark rounds
  benchmark_ratio: 0.8 # Benchmark ratio
  streaming: true # Enable streaming mode
  concurrency_list: "16"
  input_length: 1024 # Input sequence length
  output_length: 1024 # Output sequence length
  dataset_file: "<dataset_file>"

# Hardware Configuration
hardware:
  gpus_per_node: 4 # Modify this with your hardware configuration
  num_ctx_servers: 4 # Number of context servers
  num_gen_servers: 1 # Number of generation servers

# Environment Configuration
environment:
  container_mount: "<container_mount>" # Format: path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false # Don't build the wheel when launching multiple jobs
  trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
  work_dir: "<full_path_to_work_dir>"
  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"

# Worker Configuration
worker_config:
  gen:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 128
    max_seq_len: 9236
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: WIDEEP
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 416
        layer_updates_per_iter: 1
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 8448
    stream_interval: 20
    num_postprocess_workers: 4
    trust_remote_code: true
  ctx:
    max_batch_size: 1
    max_num_tokens: 8448
    max_seq_len: 8212
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 8448
    trust_remote_code: true
```

It includes SLURM-specific configurations, benchmark and hardware details, and environment settings. The `worker_config` field includes detailed settings for the context and generation servers when deploying a disaggregated server, with each specified as a set of LLM API arguments.

To launch SLURM jobs with the YAML config file, execute the following command:
```shell
cd <TensorRT LLM root>/examples/disaggregated/slurm/benchmark
python3 submit.py -c config.yaml
```

## Query the OpenAI-compatible API Endpoint

After the TensorRT LLM server is set up and shows `Application startup complete`, you can send requests to the server.

```shell
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "nvidia/Kimi-K2-Thinking-NVFP4",
    "messages": [
        {
            "role": "user",
            "content": "Where is New York?"
        }
    ],
    "max_tokens": 128,
    "top_p": 1.0
}' -w "\n"
```

Example response:

```json
{
  "id": "chatcmpl-5907ed752eb44d11a12893b19f79f8ca",
  "object": "chat.completion",
  "created": 1764866686,
  "model": "nvidia/Kimi-K2-Thinking-NVFP4",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "<think> The user is asking a very simple question: \"Where is New York?\" This could be interpreted in a few ways:\n\n1. Where is New York State located?\n2. Where is New York City located?\n3. Where is New York located in relation to something else?\n\nGiven the ambiguity, I should provide a comprehensive answer that covers the main interpretations. I should be clear and direct.\n\nLet me structure my answer:\n- First, clarify that \"New York\" can refer to either New York State or New York City\n- For New York State: It's located in the northeastern United States, bordered by New Jersey, Pennsylvania, Connecticut",
        "reasoning_content": "",
        "reasoning": null,
        "tool_calls": []
      },
      "logprobs": null,
      "finish_reason": "length",
      "stop_reason": null,
      "mm_embedding_handle": null,
      "disaggregated_params": null,
      "avg_decoded_tokens_per_iter": 1.0
    }
  ],
  "usage": {
    "prompt_tokens": 12,
    "total_tokens": 140,
    "completion_tokens": 128,
    "prompt_tokens_details": {
      "cached_tokens": 0
    }
  },
  "prompt_token_ids": null
}
```

## Benchmark

To benchmark the performance of your TensorRT LLM server, you can leverage the built-in `benchmark_serving.py` script. To do this, first create a wrapper `bench.sh` script.

```shell
cat <<'EOF' > bench.sh
#!/usr/bin/env bash
set -euo pipefail

concurrency_list="1 2 4 8 16 32 64 128 256"
multi_round=5
isl=1024
osl=1024
result_dir=/tmp/kimi_k2_thinking_output

for concurrency in ${concurrency_list}; do
    num_prompts=$((concurrency * multi_round))
    python -m tensorrt_llm.serve.scripts.benchmark_serving \
        --model nvidia/Kimi-K2-Thinking-NVFP4 \
        --backend openai \
        --dataset-name "random" \
        --random-input-len ${isl} \
        --random-output-len ${osl} \
        --random-prefix-len 0 \
        --random-ids \
        --num-prompts ${num_prompts} \
        --max-concurrency ${concurrency} \
        --ignore-eos \
        --tokenize-on-client \
        --percentile-metrics "ttft,tpot,itl,e2el"
done
EOF
chmod +x bench.sh
```

If you want to save the results to a file, add the following options:

```shell
    --save-result \
    --result-dir "${result_dir}" \
    --result-filename "concurrency_${concurrency}.json"
```

For more benchmarking options, see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py).

Run `bench.sh` to begin a serving benchmark.

```shell
./bench.sh
```
@ -39,7 +39,7 @@ docker run --rm -it \
    -p 8000:8000 \
    -v ~/.cache:/root/.cache:rw \
    --name tensorrt_llm \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4 \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
    /bin/bash
```

@ -354,31 +354,33 @@ P99 E2EL (ms): [result]

For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:

```math
\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
```
$$
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
$$

Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):

```math
$$
\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
```
$$

```math
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
```
$$
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
$$

#### End-to-End (E2E) Latency
* The typical total time from when a request is submitted until the final token of the response is received.

#### Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
```math
\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
```

$$
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
$$

#### Tokens Per Second (TPS) or Output Token Throughput
* How many output tokens the system generates each second.
```math
\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
```

$$
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
$$

@ -38,7 +38,7 @@ docker run --rm -it \
    -p 8000:8000 \
    -v ~/.cache:/root/.cache:rw \
    --name tensorrt_llm \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4 \
    nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc5 \
    /bin/bash
```

@ -346,31 +346,33 @@ P99 E2EL (ms): [result]

For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:

```math
\text{TPOT (1\ request)} = \text{Avg(ITL)} = \frac{\text{E2E\ latency} - \text{TTFT}}{\text{\#Output\ Tokens} - 1}
```
$$
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
$$

Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):

```math
$$
\text{Avg TPOT (N requests)} = \frac{\text{TPOT}_1 + \text{TPOT}_2 + \cdots + \text{TPOT}_N}{N}
```
$$

```math
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{\#Output Tokens across requests}}
```
$$
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
$$

#### End-to-End (E2E) Latency
* The typical total time from when a request is submitted until the final token of the response is received.

#### Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
```math
\text{Total\ TPS} = \frac{\text{\#Input\ Tokens}+\text{\#Output\ Tokens}}{T_{last} - T_{first}}
```

$$
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
$$

#### Tokens Per Second (TPS) or Output Token Throughput
* How many output tokens the system generates each second.
```math
\text{TPS} = \frac{\text{\#Output\ Tokens}}{T_{last} - T_{first}}
```

$$
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
$$

@ -0,0 +1,256 @@
# Deployment Guide for Qwen3 on TensorRT LLM - Blackwell & Hopper Hardware

## Introduction

This is a functional quick-start guide for running the Qwen3 model on TensorRT LLM. It focuses on a working setup with recommended defaults. Additional performance optimizations and support will be rolled out in future updates.

## Prerequisites

* GPU: NVIDIA Blackwell or Hopper Architecture
* OS: Linux
* Drivers: CUDA Driver 575 or Later
* Docker with NVIDIA Container Toolkit installed
* Python3 and python3-pip (Optional, for accuracy evaluation only)

## Models

* [Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B)
* [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B)
* [Qwen3-235B-A22B-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8)
* [Qwen3-30B-A3B-NVFP4](https://huggingface.co/nvidia/Qwen3-30B-A3B-NVFP4)
* [Qwen3-235B-A22B-NVFP4](https://huggingface.co/nvidia/Qwen3-235B-A22B-NVFP4)

## Deployment Steps

### Run Docker Container

Build and run the docker container. See the [Docker guide](../../../docker/README.md) for details.

```shell
cd TensorRT-LLM

make -C docker release_build IMAGE_TAG=qwen3-local

make -C docker release_run IMAGE_NAME=tensorrt_llm IMAGE_TAG=qwen3-local LOCAL_USER=1
```

### Recommended Performance Settings

We maintain YAML configuration files with recommended performance settings in the [`examples/configs`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/configs) directory. These config files are present in the TensorRT LLM container at the path `/app/tensorrt_llm/examples/configs`. You can use these out-of-the-box, or adjust them to your specific use case.

```shell
TRTLLM_DIR=/app/tensorrt_llm # change as needed to match your environment
EXTRA_LLM_API_FILE=${TRTLLM_DIR}/examples/configs/qwen3.yaml
```

Note: if you don't have access to the source code locally, you can manually create the YAML config file using the code in the dropdown below.

````{admonition} Show code
:class: dropdown

```{literalinclude} ../../../examples/configs/qwen3.yaml
---
language: shell
prepend: |
  EXTRA_LLM_API_FILE=/tmp/config.yml

  cat << EOF > ${EXTRA_LLM_API_FILE}
append: EOF
---
```
````


### Launch the TensorRT LLM Server

Below is an example command to launch the TensorRT LLM server with the Qwen3 model from within the container.

```shell
trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE}
```

After the server is set up, the client can now send prompt requests to the server and receive results.

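You can also query the endpoint programmatically. Below is a minimal sketch using the OpenAI Python client (`pip install openai`); the `api_key` value is a placeholder, since the locally hosted server does not validate it:

```python
from openai import OpenAI

# Point the client at the local trtllm-serve endpoint.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

response = client.chat.completions.create(
    model="Qwen/Qwen3-30B-A3B",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```
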
### LLM API Options (YAML Configuration)

<!-- TODO: this section is duplicated across the deployment guides; they should be consolidated to a central file and imported as needed, or we can remove this and link to LLM API reference -->

These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument.

#### `tensor_parallel_size`

* **Description:** Sets the **tensor-parallel size**. This should typically match the number of GPUs you intend to use for a single model instance.

#### `moe_expert_parallel_size`

* **Description:** Sets the **expert-parallel size** for Mixture-of-Experts (MoE) models. Like `tensor_parallel_size`, this should generally match the number of GPUs you're using. This setting has no effect on non-MoE models.

#### `kv_cache_free_gpu_memory_fraction`

* **Description:** A value between `0.0` and `1.0` that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
* **Recommendation:** If you experience OOM errors, try reducing this value to `0.7` or lower.

#### `max_batch_size`

* **Description:** The maximum number of user requests that can be grouped into a single batch for processing. The actual max batch size that can be achieved depends on total sequence length (input + output).

#### `max_num_tokens`

* **Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch.

#### `max_seq_len`

* **Description:** The maximum possible sequence length for a single request, including both input and generated output tokens. This guide does not set it explicitly; it is inferred from the model config.

#### `trust_remote_code`
* **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.

#### `cuda_graph_config`

* **Description**: A section for configuring CUDA graphs to optimize performance.

* **Options**:

  * `enable_padding`: If `true`, input batches are padded to the nearest `cuda_graph_batch_size`. This can significantly improve performance.

    **Default**: `false`

  * `batch_sizes`: List of batch sizes for which CUDA graphs will be pre-captured.

    **Recommendation**: Set this to cover the range of batch sizes you expect in production.

#### `moe_config`

* **Description**: Configuration for Mixture-of-Experts (MoE) models.

* **Options**:

  * `backend`: The backend to use for MoE operations.

    **Default**: `CUTLASS`

See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options that can be used in `--extra_llm_api_options`.

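Taken together, a starter YAML covering the options above might look like the following sketch; the values are illustrative placeholders rather than tuned recommendations:

```yaml
# Illustrative starter config; tune values for your model and hardware.
tensor_parallel_size: 8
moe_expert_parallel_size: 8
kv_cache_free_gpu_memory_fraction: 0.9
max_batch_size: 256
max_num_tokens: 8192
trust_remote_code: true
cuda_graph_config:
  enable_padding: true
  batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
moe_config:
  backend: CUTLASS
```
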
## Testing API Endpoint

### Basic Test

Start a new terminal on the host to test the TensorRT LLM server you just launched.

You can query the health/readiness of the server using:

```shell
curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
```

When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.

After the TensorRT LLM server is set up and shows `Application startup complete`, you can send requests to the server.

```shell
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "Qwen/Qwen3-30B-A3B",
    "messages": [
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
    "max_tokens": 512,
    "temperature": 0.7,
    "top_p": 0.95
}' -w "\n"
```

Here is an example response:

```json
{
  "id": "chatcmpl-abc123def456",
  "object": "chat.completion",
  "created": 1759022940,
  "model": "Qwen/Qwen3-30B-A3B",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "The capital of France is Paris. Paris is not only the capital but also the largest city in France, known for its rich history, culture, art, and iconic landmarks such as the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral."
      },
      "logprobs": null,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 15,
    "completion_tokens": 58,
    "total_tokens": 73
  }
}
```

### Troubleshooting Tips

* If you encounter CUDA out-of-memory errors, try reducing `max_batch_size`, `max_num_tokens`, or `kv_cache_free_gpu_memory_fraction`.
* Ensure your model checkpoints are compatible with the expected format.
* For performance issues, check GPU utilization with `nvidia-smi` while the server is running.
* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed.
* For connection issues, make sure the server port (`8000` in this guide) is not being used by another application.
* For MoE models (Qwen3-30B-A3B, Qwen3-235B-A22B), ensure `moe_expert_parallel_size` is properly configured.

## Benchmarking Performance

To benchmark the performance of your TensorRT LLM server, you can leverage the built-in `benchmark_serving.py` script. To do this, first create a wrapper `bench.sh` script.

```shell
cat <<'EOF' > bench.sh
#!/usr/bin/env bash
set -euo pipefail

# Adjust the model name based on which Qwen3 model you're benchmarking
MODEL_NAME="Qwen/Qwen3-30B-A3B"

concurrency_list="1 2 4 8 16 32 64 128"
multi_round=5
isl=1024
osl=1024
result_dir=/tmp/qwen3_output

for concurrency in ${concurrency_list}; do
    num_prompts=$((concurrency * multi_round))
    python -m tensorrt_llm.serve.scripts.benchmark_serving \
        --model ${MODEL_NAME} \
        --backend openai \
        --dataset-name "random" \
        --random-input-len ${isl} \
        --random-output-len ${osl} \
        --random-prefix-len 0 \
        --random-ids \
        --num-prompts ${num_prompts} \
        --max-concurrency ${concurrency} \
        --ignore-eos \
        --tokenize-on-client \
        --percentile-metrics "ttft,tpot,itl,e2el"
done
EOF
chmod +x bench.sh
```

To achieve max throughput with attention DP on, sweep up to `concurrency = max_batch_size * num_gpus`; for example, with `max_batch_size: 128` on 8 GPUs, sweep up to a concurrency of 1024.

If you want to save the results to a file, add the following options:

```shell
    --save-result \
    --result-dir "${result_dir}" \
    --result-filename "concurrency_${concurrency}.json"
```

For more benchmarking options, see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py).

Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies listed in the `bench.sh` script above.

```shell
./bench.sh
```

@ -91,4 +91,6 @@ The deployment guides below provide more detailed instructions for serving speci
deployment-guide-for-llama3.3-70b-on-trtllm.md
deployment-guide-for-llama4-scout-on-trtllm.md
deployment-guide-for-gpt-oss-on-trtllm.md
deployment-guide-for-qwen3-on-trtllm.md
deployment-guide-for-qwen3-next-on-trtllm.md
deployment-guide-for-kimi-k2-thinking-on-trtllm.md

@ -2,7 +2,7 @@ Curl Chat Client
================
Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ for starting a server.

Source https://github.com/NVIDIA/TensorRT-LLM/blob/a761585d9c15b4c1249aaf65a8f90764efa83a3c/examples/serve/curl_chat_client.sh.
Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_chat_client.sh.

.. literalinclude:: ../../../examples/serve/curl_chat_client.sh
   :lines: 1-11

@ -2,7 +2,7 @@ Curl Chat Client For Multimodal
===============================
Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ for starting a server.

Source https://github.com/NVIDIA/TensorRT-LLM/blob/a761585d9c15b4c1249aaf65a8f90764efa83a3c/examples/serve/curl_chat_client_for_multimodal.sh.
Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_chat_client_for_multimodal.sh.

.. literalinclude:: ../../../examples/serve/curl_chat_client_for_multimodal.sh
   :lines: 1-88

@ -2,7 +2,7 @@ Curl Completion Client
======================
Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ for starting a server.

Source https://github.com/NVIDIA/TensorRT-LLM/blob/a761585d9c15b4c1249aaf65a8f90764efa83a3c/examples/serve/curl_completion_client.sh.
Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/curl_completion_client.sh.

.. literalinclude:: ../../../examples/serve/curl_completion_client.sh
   :lines: 1-10

@ -2,9 +2,9 @@ Deepseek R1 Reasoning Parser
============================
Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ for starting a server.

Source https://github.com/NVIDIA/TensorRT-LLM/blob/a761585d9c15b4c1249aaf65a8f90764efa83a3c/examples/serve/deepseek_r1_reasoning_parser.sh.
Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/deepseek_r1_reasoning_parser.sh.

.. literalinclude:: ../../../examples/serve/deepseek_r1_reasoning_parser.sh
   :lines: 1-10
   :lines: 1-23
   :language: bash
   :linenos:

@ -2,7 +2,7 @@ Genai Perf Client
=================
Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ for starting a server.

Source https://github.com/NVIDIA/TensorRT-LLM/blob/a761585d9c15b4c1249aaf65a8f90764efa83a3c/examples/serve/genai_perf_client.sh.
Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/genai_perf_client.sh.

.. literalinclude:: ../../../examples/serve/genai_perf_client.sh
   :lines: 1-16

@ -2,7 +2,7 @@ Genai Perf Client For Multimodal
================================
Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ for starting a server.

Source https://github.com/NVIDIA/TensorRT-LLM/blob/a761585d9c15b4c1249aaf65a8f90764efa83a3c/examples/serve/genai_perf_client_for_multimodal.sh.
Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/serve/genai_perf_client_for_multimodal.sh.

.. literalinclude:: ../../../examples/serve/genai_perf_client_for_multimodal.sh
   :lines: 1-19

@ -1,6 +1,6 @@
Generate text with guided decoding
==================================
Source https://github.com/NVIDIA/TensorRT-LLM/blob/a761585d9c15b4c1249aaf65a8f90764efa83a3c/examples/llm-api/llm_guided_decoding.py.
Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_guided_decoding.py.

.. literalinclude:: ../../../examples/llm-api/llm_guided_decoding.py
   :lines: 4-47

@ -1,6 +1,6 @@
Generate text
=============
Source https://github.com/NVIDIA/TensorRT-LLM/blob/a761585d9c15b4c1249aaf65a8f90764efa83a3c/examples/llm-api/llm_inference.py.
Source https://github.com/NVIDIA/TensorRT-LLM/blob/e4c707845ff58fcc0b1d87afb4dd0e64885c780a/examples/llm-api/llm_inference.py.

.. literalinclude:: ../../../examples/llm-api/llm_inference.py
   :lines: 4-35
