mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-22 19:52:38 +08:00)

Update latest GitHub pages to v1.0.0rc3

parent 55e975f97e
commit 7b17110c12
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: cb3cbe8a473ef8fd1cf27e6890eb63f4
+config: ee79abf721be5d1b28815a3912832a13
 tags: 645f666f9bcd5a90fca523b33c5a78b7
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -11,7 +11,8 @@ from tensorrt_llm.mapping import Mapping

 from ..attention_backend import (AttentionInputType, AttentionMetadata,
                                  TrtllmAttention, TrtllmAttentionMetadata)
-from ..attention_backend.interface import (PositionalEmbeddingParams,
+from ..attention_backend.interface import (AttentionMask,
+                                           PositionalEmbeddingParams,
                                            PredefinedAttentionMask)
 from ..attention_backend.utils import create_attention, get_attention_backend
 from ..distributed import AllReduceParams
@@ -67,8 +68,9 @@ class Attention(nn.Module):
         config = config or ModelConfig()
         self.hidden_size = hidden_size
         self.num_heads = num_attention_heads
-        self.head_dim = getattr(config.pretrained_config, "head_dim",
-                                self.hidden_size // self.num_heads)
+        self.head_dim = getattr(config.pretrained_config, 'head_dim', None)
+        if not isinstance(self.head_dim, int):
+            self.head_dim = self.hidden_size // self.num_heads
         self.num_key_value_heads = num_key_value_heads
         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         self.max_position_embeddings = max_position_embeddings
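The head_dim change above is subtle: getattr with a default only falls back when the attribute is missing, not when it is present but None. A minimal sketch of the difference (the SimpleNamespace stand-in for pretrained_config is ours):

    from types import SimpleNamespace

    cfg = SimpleNamespace(head_dim=None)  # a config that sets head_dim explicitly to null
    hidden_size, num_heads = 4096, 32

    # Old lookup: the default is used only when the attribute is absent,
    # so an explicit None slips through.
    head_dim = getattr(cfg, "head_dim", hidden_size // num_heads)
    assert head_dim is None

    # New lookup: fall back whenever the value is not a concrete int.
    head_dim = getattr(cfg, "head_dim", None)
    if not isinstance(head_dim, int):
        head_dim = hidden_size // num_heads
    assert head_dim == 128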
@@ -225,12 +227,12 @@ class Attention(nn.Module):
         position_ids: Optional[torch.IntTensor],
         hidden_states: Union[torch.Tensor, Fp4QuantizedTensor],
         attn_metadata: AttentionMetadata,
-        attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.
-        CAUSAL,
+        attention_mask: AttentionMask = PredefinedAttentionMask.CAUSAL,
         mrope_config: Optional[dict] = None,
         all_reduce_params: Optional[AllReduceParams] = None,
         lora_params: Optional[dict] = None,
         attention_window_size: Optional[int] = None,
+        attention_mask_data: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -240,12 +242,12 @@ class Attention(nn.Module):
             position_ids (Optional[torch.IntTensor]): The position IDs.
             hidden_states (torch.Tensor): The hidden states.
             attn_metadata (AttentionMetadata): The attention metadata.
-            attention_mask (PredefinedAttentionMask): The attention mask type.
+            attention_mask (AttentionMask): The attention mask type.
             mrope_config (Optional[dict]): The MROPE configuration.
             all_reduce_params (Optional[AllReduceParams]): The all reduce parameters.
             lora_params (Optional[dict]): The LoRA parameters.
             attention_window_size (Optional[int]): The attention window size.
-
+            attention_mask_data (Optional[torch.Tensor]): The attention mask data.
         Returns:
             torch.Tensor: The output tensor.
         """
@@ -268,7 +270,7 @@ class Attention(nn.Module):

         out_scale = None
         out_scale_sf = None
-        if self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales:
+        if self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales or self.o_proj.has_fp8_rowwise:
             out_scale = self.o_proj.inv_input_scale
         if self.o_proj.has_nvfp4 and self.support_nvfp4_output:
             out_scale_sf = self.o_proj.input_scale
@@ -283,7 +285,8 @@ class Attention(nn.Module):
             out_scale_sf=out_scale_sf,
             attention_mask=attention_mask,
             mrope_config=mrope_config,
-            attention_window_size=attention_window_size)
+            attention_window_size=attention_window_size,
+            attention_mask_data=attention_mask_data)
         hidden_states = attn_output
         attn_output = self.o_proj(attn_output,
                                   all_reduce_params=all_reduce_params,
@@ -356,7 +359,7 @@ def fp8_block_scaling_bmm_out(
     out: torch.Tensor,
 ) -> torch.Tensor:
     sm_version = get_sm_version()
-    if sm_version == 90:
+    if sm_version == 90 or sm_version == 89:
         mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
             mat1)
         torch.ops.trtllm.fp8_block_scaling_bmm_out(mat1_fp8, mat2_fp8,

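For context on the widened guard: helpers like get_sm_version() conventionally return the CUDA compute capability packed as major * 10 + minor, so 90 is Hopper and 89 is Ada Lovelace, both of which ship FP8 tensor cores. A hedged stand-in (ours, not the TensorRT-LLM helper itself):

    import torch

    def sm_version() -> int:
        # Compute capability packed as major * 10 + minor, e.g. (8, 9) -> 89.
        major, minor = torch.cuda.get_device_capability()
        return major * 10 + minor

    def supports_fp8_block_scaling(sm: int) -> bool:
        # SM 90 (Hopper) and SM 89 (Ada) both have FP8 tensor cores.
        return sm in (89, 90)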
@@ -21,7 +21,7 @@ import torch
 from tqdm import tqdm

 from ..._utils import pad_vocab_size
-from ...functional import Tensor, recv, send
+from ...functional import LayerNormType, Tensor, recv, send
 from ...layers import (MOE, Attention, AttentionMaskType, ColumnLinear,
                        Embedding, GatedMLP, RmsNorm, SharedMoE)
 from ...layers.moe import MOEWeightWrapper
@@ -56,6 +56,9 @@ class QWenDecoderLayer(Module):

         layers_range = config.mapping.pp_layers(config.num_hidden_layers)
         local_layer_idx = layer_idx - layers_range[0]
+        # Qwen3: Enable qk_layernorm for Q/K normalization (similar to Gemma3)
+        qk_layernorm = config.qwen_type in ('qwen3', 'qwen3_moe')
+
         self.attention = Attention(
             local_layer_idx=local_layer_idx,
             hidden_size=config.hidden_size,
@@ -78,7 +81,11 @@ class QWenDecoderLayer(Module):
             cp_group=config.mapping.cp_group,
             quant_mode=config.quant_mode,
             use_logn_scaling=config.use_logn_attn,
-            dense_bias=False)
+            dense_bias=False,
+            # Qwen3: Add Q/K layer normalization
+            qk_layernorm=qk_layernorm,
+            layernorm_type=LayerNormType.RmsNorm
+            if qk_layernorm else LayerNormType.LayerNorm)

         if config.moe.has_moe():
             mlp_kwargs = {'moe_config': config.moe, 'mapping': config.mapping}
@@ -353,6 +360,11 @@ class QWenForCausalLM(DecoderModelForCausalLM):
                 "transformer": "language_model.model",
                 "lm_head": "language_model.lm_head",
             }
+        elif config.qwen_type in ("qwen3", "qwen3_moe"):
+            custom_dict = {
+                "q_layernorm": "q_norm",
+                "k_layernorm": "k_norm",
+            }
         loader = ModelWeightsLoader(hf_model_dir, custom_dict)
         model = cls(config)
         if config.qwen_type == "qwen" and model.config.mapping.has_tp():
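The custom_dict entries map TensorRT-LLM parameter-name fragments to the names used in the Hugging Face checkpoint. An illustrative sketch of that substitution (the helper below is hypothetical, not the ModelWeightsLoader internals):

    def remap_key(trtllm_key: str, custom_dict: dict) -> str:
        # Replace each TRT-LLM name fragment with its HF checkpoint counterpart.
        for ours, theirs in custom_dict.items():
            trtllm_key = trtllm_key.replace(ours, theirs)
        return trtllm_key

    custom_dict = {"q_layernorm": "q_norm", "k_layernorm": "k_norm"}
    key = remap_key("transformer.layers.0.attention.q_layernorm.weight", custom_dict)
    assert key == "transformer.layers.0.attention.q_norm.weight"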
@@ -1,4 +1,5 @@
 import copy
+import functools
 import json
 import math
 import os
@@ -222,7 +223,8 @@ class _ModelFormatKind(Enum):

 class DecodingBaseConfig(BaseModel):
     max_draft_len: Optional[int] = None
-    speculative_model: Optional[Union[str, Path]] = None
+    speculative_model_dir: Optional[Union[str, Path]] = None
+    num_extra_kv_tokens: int = 0

     @classmethod
     def from_dict(cls, data: dict):
@@ -235,6 +237,7 @@ class DecodingBaseConfig(BaseModel):
             "Lookahead": LookaheadDecodingConfig,
             "NGram": NGramDecodingConfig,
             "DraftTarget": DraftTargetDecodingConfig,
+            "UserProvided": UserProvidedDecodingConfig,
         }

         config_class = config_classes.get(decoding_type)
@@ -246,6 +249,35 @@ class DecodingBaseConfig(BaseModel):
     def _check_fields(self):
         pass

+    def supports_backend(self, backend: str) -> bool:
+        """
+        Override if the speculation algorithm does not support
+        a subset of the possible backends.
+        """
+        return True
+
+    def validate(self) -> None:
+        """
+        Do any additional error checking here.
+        """
+
+    @functools.cached_property
+    def spec_dec_mode(self):
+        # spec_dec_mode has more functionality than the raw decoding_mode string.
+        # Use an alias for the import here to avoid name collisions with the one for the
+        # TRT backend.
+        from tensorrt_llm._torch.speculative.interface import \
+            SpeculativeDecodingMode as TorchSpeculativeDecodingMode
+        return TorchSpeculativeDecodingMode.from_string(
+            self.decoding_type.upper())
+
+    def update_from_model_config(self, model_config):
+        pass
+
+    def get_draft_model_prompt(self,
+                               input_tokens: torch.Tensor) -> torch.Tensor:
+        return input_tokens
+

 class MedusaDecodingConfig(DecodingBaseConfig):
     medusa_choices: Optional[List[List[int]]] = None
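Taken together, the new base-class hooks let each algorithm declare backend support, extra validation, and draft-prompt preprocessing. A minimal subclass sketch (the class and its decoding_type are hypothetical):

    from typing import ClassVar

    import torch

    class MyDecodingConfig(DecodingBaseConfig):
        decoding_type: ClassVar[str] = "MyAlgo"  # illustrative name

        def supports_backend(self, backend: str) -> bool:
            # This algorithm only runs on the PyTorch backend.
            return backend == "pytorch"

        def get_draft_model_prompt(self,
                                   input_tokens: torch.Tensor) -> torch.Tensor:
            # Drop the first token for draft inputs, as EAGLE3 does below.
            return input_tokens[1:]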
@@ -257,6 +289,9 @@ class MedusaDecodingConfig(DecodingBaseConfig):

     decoding_type: ClassVar[str] = "Medusa"

+    def supports_backend(self, backend: str) -> bool:
+        return backend not in ("pytorch", "_autodeploy")
+

 class EagleDecodingConfig(DecodingBaseConfig):
     eagle_choices: Optional[List[List[int]]] = None
@@ -266,7 +301,6 @@ class EagleDecodingConfig(DecodingBaseConfig):
     dynamic_tree_max_topK: Optional[int] = None
     num_eagle_layers: Optional[int] = None
     max_non_leaves_per_layer: Optional[int] = None
-    pytorch_weights_path: Optional[str] = None
     eagle3_one_model: Optional[bool] = True

     @classmethod
@@ -275,13 +309,43 @@ class EagleDecodingConfig(DecodingBaseConfig):

     decoding_type: ClassVar[str] = "Eagle"

+    def validate(self) -> None:
+        if self.speculative_model_dir is None:
+            raise ValueError("Draft model must be provided for EAGLE")
+
+    @functools.cached_property
+    def spec_dec_mode(self):
+        from tensorrt_llm._torch.speculative.interface import \
+            SpeculativeDecodingMode as TorchSpeculativeDecodingMode
+        if self.eagle3_one_model:
+            return TorchSpeculativeDecodingMode.EAGLE3_ONE_MODEL
+        return TorchSpeculativeDecodingMode.EAGLE3
+
+    def get_draft_model_prompt(self,
+                               input_tokens: torch.Tensor) -> torch.Tensor:
+        """
+        Eagle3 always throws away the first token when processing draft inputs
+        """
+        return input_tokens[1:]
+
+
+class UserProvidedDecodingConfig(DecodingBaseConfig):
+    # Type should be Drafter, but it leads to circular import
+    drafter: object
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(**data)
+
+    decoding_type: ClassVar[str] = "User_Provided"
+

 class NGramDecodingConfig(DecodingBaseConfig):
     """
     Configuration for NGram drafter speculative decoding.

     Arguments:
-        prompt_lookup_num_tokens: int
+        max_draft_len: int
             The length maximum of draft tokens (can be understood as length maximum of output draft tokens).

         max_matching_ngram_size: int
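A hedged usage sketch of the EAGLE additions above: validate() now insists on a draft model directory, and eagle3_one_model selects between the fused and two-model EAGLE3 modes (the directory below is a placeholder):

    import torch

    cfg = EagleDecodingConfig(max_draft_len=4,
                              speculative_model_dir="/path/to/eagle3-draft",
                              eagle3_one_model=False)
    cfg.validate()            # passes: the draft model directory is set
    cfg.spec_dec_mode         # EAGLE3 (two-model); EAGLE3_ONE_MODEL if fused
    cfg.get_draft_model_prompt(torch.tensor([1, 2, 3]))  # -> tensor([2, 3])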
@@ -297,7 +361,6 @@ class NGramDecodingConfig(DecodingBaseConfig):
         Whether to use a common pool for all requests, or the pool is private for each request if False.
     """

-    prompt_lookup_num_tokens: int = 2
     max_matching_ngram_size: int = 4
     is_keep_all: bool = True
     is_use_oldest: bool = True
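With this hunk the NGram drafter takes its draft length from the base class's max_draft_len instead of a local prompt_lookup_num_tokens field. A hedged construction sketch:

    cfg = NGramDecodingConfig(
        max_draft_len=4,            # upper bound on emitted draft tokens
        max_matching_ngram_size=4,  # longest suffix n-gram used for matching
        is_keep_all=True,
        is_use_oldest=True,
        is_public_pool=True,
    )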
@@ -309,23 +372,39 @@ class NGramDecodingConfig(DecodingBaseConfig):

     decoding_type: ClassVar[str] = "NGram"

+    def supports_backend(self, backend: str) -> bool:
+        return backend == "pytorch"
+

 class DraftTargetDecodingConfig(DecodingBaseConfig):
-    pytorch_weights_path: Optional[str] = None
-
     @classmethod
     def from_dict(cls, data: dict):
         return cls(**data)

-    decoding_type: ClassVar[str] = "DraftTarget"
+    decoding_type: ClassVar[str] = "Draft_Target"
+
+    def supports_backend(self, backend: str) -> bool:
+        return backend == "pytorch"
+

 class MTPDecodingConfig(DecodingBaseConfig):
-    num_nextn_predict_layers: Optional[int] = 1
-    use_relaxed_acceptance_for_thinking: Optional[bool] = False
-    relaxed_topk: Optional[int] = 1
-    relaxed_delta: Optional[float] = 0.
-    use_mtp_vanilla: Optional[bool] = False
+    num_nextn_predict_layers: int = 1
+    use_relaxed_acceptance_for_thinking: bool = False
+    relaxed_topk: int = 1
+    relaxed_delta: float = 0.
+    use_mtp_vanilla: bool = False
+
+    # TODO: remove this after distinguishing `max_draft_len` and `num_nextn_predict_layers`
+    # Now we need a flag when MTPDecodingConfig is updated by PyTorchModelEngine.
+    num_nextn_predict_layers_from_model_config: int = 1
+
+    # TODO: Hard code for DeepSeek R1
+    # When encounter <think>, start thinking phase.
+    # When encounter </think>, end thinking phase.
+    # <think> [thinking phase] </think> [real output]
+    BEGIN_THINKING_PHASE_TOKEN: int = 128798
+    END_THINKING_PHASE_TOKEN: int = 128799

     @classmethod
     def from_dict(cls, data: dict):
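An illustrative sketch (ours, not engine code) of the thinking-phase bracketing that the two class constants above encode for DeepSeek R1, where relaxed acceptance is meant to apply only inside <think> ... </think>:

    BEGIN, END = 128798, 128799  # BEGIN/END_THINKING_PHASE_TOKEN

    def in_thinking_phase(tokens: list) -> bool:
        # True while the latest phase marker is an unclosed <think>.
        phase = False
        for t in tokens:
            if t == BEGIN:
                phase = True
            elif t == END:
                phase = False
        return phase

    assert in_thinking_phase([5, BEGIN, 7])           # inside <think> ...
    assert not in_thinking_phase([5, BEGIN, 7, END])  # ... closed by </think>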
@@ -333,6 +412,22 @@ class MTPDecodingConfig(DecodingBaseConfig):

     decoding_type: ClassVar[str] = "MTP"

+    def supports_backend(self, backend: str) -> bool:
+        return backend == "pytorch"
+
+    @functools.cached_property
+    def spec_dec_mode(self):
+        from tensorrt_llm._torch.speculative.interface import \
+            SpeculativeDecodingMode as TorchSpeculativeDecodingMode
+        if self.num_nextn_predict_layers_from_model_config == 1 and not self.use_mtp_vanilla:
+            return TorchSpeculativeDecodingMode.MTP_EAGLE
+        return TorchSpeculativeDecodingMode.MTP
+
+    def update_from_model_config(self, model_config):
+        assert self.num_nextn_predict_layers > 0
+        if model_config.num_nextn_predict_layers == 1 and not self.use_mtp_vanilla:
+            self.num_extra_kv_tokens = self.num_nextn_predict_layers - 1
+

 class PybindMirror(ABC):
     ''' A class containing the utilities for mirroring Python classes to
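In brief, the MTP mode choice above: one next-token prediction layer and no vanilla MTP routes to the lighter MTP_EAGLE path and reserves no extra KV tokens. A hedged walk-through (the model_config stand-in is ours):

    from types import SimpleNamespace

    cfg = MTPDecodingConfig(num_nextn_predict_layers=1)
    cfg.update_from_model_config(SimpleNamespace(num_nextn_predict_layers=1))
    # num_extra_kv_tokens is now 1 - 1 == 0, and since
    # num_nextn_predict_layers_from_model_config == 1 and use_mtp_vanilla is False,
    # cfg.spec_dec_mode resolves to SpeculativeDecodingMode.MTP_EAGLE.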
@@ -623,6 +718,9 @@ class LookaheadDecodingConfig(DecodingBaseConfig, PybindMirror):
                                              self.max_ngram_size,
                                              self.max_verification_set_size)

+    def supports_backend(self, backend: str) -> bool:
+        return backend not in ("pytorch", "_autodeploy")
+
     decoding_type: ClassVar[str] = "Lookahead"

@@ -633,6 +731,7 @@ SpeculativeConfig: TypeAlias = Optional[Union[
     MedusaDecodingConfig,
     MTPDecodingConfig,
     NGramDecodingConfig,
+    UserProvidedDecodingConfig,
 ]]

@@ -1024,7 +1123,7 @@ class BaseLlmArgs(BaseModel):
         return self._model_format

     @property
-    def speculative_model(self) -> Optional[_ModelFormatKind]:
+    def speculative_model_dir(self) -> Optional[_ModelFormatKind]:
         return self._speculative_model

     @property
@@ -1301,33 +1400,40 @@ class BaseLlmArgs(BaseModel):
     @model_validator(mode="after")
     def validate_speculative_config(self):
         if self.speculative_config:
-            if isinstance(self.speculative_config, LookaheadDecodingConfig):
-                lookahead_config = self.speculative_config
-                # Update the build config
-                _, _, max_draft_tokens, _ = lookahead_config.calculate_speculative_resource(
-                )
-                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.LOOKAHEAD_DECODING
-                if max_draft_tokens > self.build_config.max_draft_len:
-                    self.build_config.max_draft_len = max_draft_tokens
+            if not self.speculative_config.supports_backend(self.backend):
+                raise ValueError(
+                    f"Speculation type {self.speculative_config.decoding_type} does not "
+                    f"support backend {self.backend}")
+
+            # Below, we only need to set speculative_decoding_mode/decoding_config for speculation
+            # on the TRT backend.
+            if isinstance(self.speculative_config, LookaheadDecodingConfig):
+                max_draft_len = self.speculative_config.calculate_speculative_resource(
+                )[2]
+                assert max_draft_len > 0
+                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.LOOKAHEAD_DECODING
+                self.build_config.max_draft_len = max(
+                    self.build_config.max_draft_len, max_draft_len)
                 self.decoding_config = DecodingConfig(
                     decoding_mode=DecodingMode.Lookahead(),
                     lookahead_decoding_config=PybindMirror.maybe_to_pybind(
-                        lookahead_config))
-            elif isinstance(self.speculative_config, MedusaDecodingConfig):
-                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA
+                        self.speculative_config))
+
+            elif isinstance(self.speculative_config, MedusaDecodingConfig):
+                assert self.speculative_config.max_draft_len > 0
+                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA
                 self.build_config.max_draft_len = self.speculative_config.max_draft_len
                 self.decoding_config = DecodingConfig(
                     decoding_mode=DecodingMode.Medusa(),
                     medusa_choices=self.speculative_config.medusa_choices)
+
             elif isinstance(self.speculative_config, EagleDecodingConfig):
-                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
+                assert self.speculative_config.max_draft_len > 0
+
+                assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
                 self.build_config.max_draft_len = self.speculative_config.max_draft_len
+
+                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
+                if self.speculative_config.eagle3_one_model:
+                    self.speculative_config.num_extra_kv_tokens = self.speculative_config.max_draft_len - 1
                 if self.backend not in ['pytorch', '_autodeploy']:
                     eagle_config = _EagleConfig(
                         self.speculative_config.eagle_choices,
@@ -1338,59 +1444,39 @@ class BaseLlmArgs(BaseModel):
                     self.decoding_config = DecodingConfig(
                         decoding_mode=DecodingMode.Eagle(),
                         eagle_config=eagle_config)
-                else:
-                    from tensorrt_llm._torch.speculative import Eagle3Config
-                    self.speculative_config = Eagle3Config(
-                        max_draft_tokens=self.speculative_config.max_draft_len,
-                        draft_model_path=self.speculative_config.
-                        pytorch_weights_path,
-                        eagle3_one_model=self.speculative_config.
-                        eagle3_one_model)

             elif isinstance(self.speculative_config, NGramDecodingConfig):
-                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
-                assert self.speculative_config.prompt_lookup_num_tokens > 0 and self.speculative_config.max_matching_ngram_size > 0
+                assert self.backend in ['pytorch', '_autodeploy']
+                assert self.speculative_config.max_draft_len > 0 and self.speculative_config.max_matching_ngram_size > 0
+                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
                 self.build_config.max_draft_len = self.speculative_config.max_draft_len
-                from tensorrt_llm._torch.speculative import NGramConfig
-                self.speculative_config = NGramConfig(
-                    prompt_lookup_num_tokens=self.speculative_config.
-                    prompt_lookup_num_tokens,
-                    max_matching_ngram_size=self.speculative_config.
-                    max_matching_ngram_size,
-                    is_keep_all=self.speculative_config.is_keep_all,
-                    is_use_oldest=self.speculative_config.is_use_oldest,
-                    is_public_pool=self.speculative_config.is_public_pool,
-                )

             elif isinstance(self.speculative_config, DraftTargetDecodingConfig):
-                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
-                assert self.backend == 'pytorch'
+                assert self.backend in ['pytorch']
+                assert self.speculative_config.max_draft_len > 0
+                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
                 self.build_config.max_draft_len = self.speculative_config.max_draft_len
-                from tensorrt_llm._torch.speculative import DraftTargetConfig
-                self.speculative_config = DraftTargetConfig(
-                    max_draft_tokens=self.speculative_config.max_draft_len,
-                    draft_model_path=self.speculative_config.
-                    pytorch_weights_path)

             elif isinstance(self.speculative_config, MTPDecodingConfig):
-                from tensorrt_llm._torch.speculative import MTPConfig
-                self.speculative_config = MTPConfig(
-                    num_nextn_predict_layers=self.speculative_config.
-                    num_nextn_predict_layers,
-                    max_batch_size=self.build_config.max_batch_size,
-                    use_relaxed_acceptance_for_thinking=self.speculative_config.
-                    use_relaxed_acceptance_for_thinking,
-                    relaxed_topk=self.speculative_config.relaxed_topk,
-                    relaxed_delta=self.speculative_config.relaxed_delta,
-                    use_mtp_vanilla=self.speculative_config.use_mtp_vanilla)
+                assert self.speculative_config.num_nextn_predict_layers > 0
+                self.speculative_config.max_draft_len = self.speculative_config.num_nextn_predict_layers
+
+            elif isinstance(self.speculative_config,
+                            UserProvidedDecodingConfig):
+                assert self.backend in ['pytorch', '_autodeploy']
+                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.USER_PROVIDED
+                self.build_config.max_draft_len = self.speculative_config.max_draft_len
+
             else:
                 raise ValueError(
-                    f"Speculative config type not recognized: {self.speculative_config}"
+                    f"Unrecognized speculative config type {type(self.speculative_config)}"
                 )

         else:
             self.decoding_config = None

         self._speculative_model = getattr(self.speculative_config,
-                                          "speculative_model", None)
+                                          "speculative_model_dir", None)
         speculative_model_obj = _ModelWrapper(
             self._speculative_model
         ) if self._speculative_model is not None else None
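Net effect of the reworked validator: backend compatibility is checked once up front through supports_backend, and the per-algorithm conversion into _torch config objects is gone. A hedged illustration:

    medusa = MedusaDecodingConfig(max_draft_len=4)
    assert not medusa.supports_backend("pytorch")  # Medusa remains TRT-engine only
    ngram = NGramDecodingConfig(max_draft_len=4)
    assert ngram.supports_backend("pytorch")
    # An unsupported pairing now fails fast in validate_speculative_config with:
    #   Speculation type Medusa does not support backend pytorch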
@@ -1702,7 +1788,7 @@ class TorchLlmArgs(BaseLlmArgs):
     moe_backend: str = Field(default='CUTLASS',
                              description="MoE backend to use.")

-    mixed_sampler: bool = Field(
+    enable_mixed_sampler: bool = Field(
         default=False,
         description=
         "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
@@ -1732,7 +1818,7 @@ class TorchLlmArgs(BaseLlmArgs):
     torch_compile_config: Optional[TorchCompileConfig] = Field(
         default=None, description="Torch compile config.")

-    autotuner_enabled: bool = Field(
+    enable_autotuner: bool = Field(
         default=True,
         description="Enable autotuner only when torch compile is enabled.")

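Both renames follow the same enable_* convention, and the new names are forwarded to PyTorchConfig in the hunks below. A hedged construction sketch (the model path is a placeholder, and other required arguments are omitted):

    args = TorchLlmArgs(model="/path/to/model",
                        enable_mixed_sampler=True,  # was: mixed_sampler
                        enable_autotuner=True)      # was: autotuner_enabled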
@@ -1918,7 +2004,7 @@ class TorchLlmArgs(BaseLlmArgs):
             moe_load_balancer=self.moe_load_balancer,
             attn_backend=self.attn_backend,
             moe_backend=self.moe_backend,
-            mixed_sampler=self.mixed_sampler,
+            enable_mixed_sampler=self.enable_mixed_sampler,
             enable_trtllm_sampler=self.enable_trtllm_sampler,
             kv_cache_dtype=self.kv_cache_dtype,
             enable_iter_perf_stats=self.enable_iter_perf_stats,
@@ -1938,7 +2024,7 @@ class TorchLlmArgs(BaseLlmArgs):
             torch_compile_enable_userbuffers=self.torch_compile_config.
             enable_userbuffers if self.torch_compile_config is not None else
             TorchCompileConfig.model_fields['enable_userbuffers'].default,
-            autotuner_enabled=self.autotuner_enabled,
+            enable_autotuner=self.enable_autotuner,
             enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker,
             load_format=self.load_format,
             enable_min_latency=self.enable_min_latency,
Binary file not shown. (Before: 39 KiB)
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -671,9 +672,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -1980,9 +1981,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -651,9 +652,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -872,17 +873,16 @@
         if self.sampling_params.detokenize and self.tokenizer is not None:
             for beam_output in self.outputs:
                 beam_output._last_text_len = len(beam_output.text)
-                if hasattr(self.tokenizer, 'decode_incrementally'):
-                    if self._streaming and not self.sampling_params.use_beam_search:
-                        beam_output.text, beam_output._incremental_states = self.tokenizer.decode_incrementally(
-                            beam_output.token_ids_diff,
-                            prev_text=beam_output.text,
-                            states=beam_output._incremental_states,
-                            flush=self._done,
-                            **kwargs)
-                    else:
-                        beam_output.text, _ = self.tokenizer.decode_incrementally(
-                            beam_output.token_ids, flush=self._done, **kwargs)
+                if hasattr(
+                        self.tokenizer, 'decode_incrementally'
+                ) and self._streaming and not self.sampling_params.use_beam_search:
+                    beam_output.text, beam_output._incremental_states = self.tokenizer.decode_incrementally(
+                        beam_output.token_ids_diff,
+                        prev_text=beam_output.text,
+                        states=beam_output._incremental_states,
+                        flush=self._done,
+                        stream_interval=self.sampling_params._stream_interval,
+                        **kwargs)
                 else:
                     beam_output.text = self.tokenizer.decode(
                         beam_output.token_ids, **kwargs)
@@ -1268,9 +1268,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
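The Python hunk in the page above restructures the detokenization branch: incremental decoding now runs only while streaming without beam search, and the request's stream_interval is forwarded. A runnable sketch of the new control flow (the stub tokenizer is ours; names mirror the diff):

    class StubTokenizer:
        def decode_incrementally(self, ids, prev_text="", states=None,
                                 flush=False, stream_interval=1, **kwargs):
            return prev_text + f"<{len(ids)} new tokens>", states

        def decode(self, ids, **kwargs):
            return f"<{len(ids)} tokens>"

    tokenizer, streaming, use_beam_search, done = StubTokenizer(), True, False, False
    if hasattr(tokenizer, "decode_incrementally"
               ) and streaming and not use_beam_search:
        text, states = tokenizer.decode_incrementally(
            [1, 2], prev_text="", states=None, flush=done, stream_interval=2)
    else:
        text = tokenizer.decode([1, 2, 3])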
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -774,9 +775,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -8705,9 +8706,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -630,9 +631,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -3495,9 +3496,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
@@ -57,7 +57,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -67,7 +67,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc2" />
+<meta name="docsearch:version" content="1.0.0rc3" />

 </head>
@@ -341,6 +341,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -401,7 +402,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
 <ul class="nav bd-sidenav">
@@ -637,9 +638,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on July 06, 2025.</p>
+<p>Last updated on July 14, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

 </div></div>
@ -506,6 +507,7 @@
from transformers import PreTrainedTokenizerBase

from tensorrt_llm.inputs.data import TextPrompt
from tensorrt_llm.inputs.multimodal import MultimodalParams
from tensorrt_llm.inputs.registry import DefaultInputProcessor

from .._utils import nvtx_range_debug
@ -856,9 +858,8 @@
        sampling_params.add_special_tokens = False

        query_token_ids = None
        multimodal_input = None
        multimodal_embedding = None
        mrope_config = None
        multimodal_params = None

        if "prompt_token_ids" in inputs:
            # TODO: if specify prompt_token_ids, the mm hashing is not supported yet
            prompt_token_ids = inputs['prompt_token_ids']
@ -883,11 +884,15 @@
            prompt = inputs['prompt']
            if extra_processed_inputs is not None:
                query_token_ids = extra_processed_inputs.get('query_token_ids')
                multimodal_embedding = extra_processed_inputs.get(
                    'mm_embedding')
                mrope_config = extra_processed_inputs.get('mrope_config')
                multimodal_input = extra_processed_inputs.get(
                    'multimodal_input')
                # Create unified MultimodalParams
                multimodal_params = MultimodalParams(
                    multimodal_input=extra_processed_inputs.get(
                        'multimodal_input'),
                    multimodal_data=extra_processed_inputs.get(
                        'multimodal_data'))
                # Only pass it if it has content
                if not multimodal_params.has_content():
                    multimodal_params = None
        else:
            raise TypeError(
                f"The inputs must be type str or list of int, but got {type(inputs)}"
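The hunk above collapses the separate multimodal_input / multimodal_embedding / mrope_config locals into one MultimodalParams container that is forwarded only when it actually carries content. A minimal runnable sketch of that gating pattern follows; MultimodalParamsSketch and collect_params are illustrative stand-ins, not the real tensorrt_llm.inputs.multimodal API.

from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class MultimodalParamsSketch:
    """Illustrative stand-in for MultimodalParams; not the library class."""
    multimodal_input: Optional[Any] = None
    multimodal_data: Optional[Any] = None

    def has_content(self) -> bool:
        # True when the input processor populated at least one field.
        return self.multimodal_input is not None or self.multimodal_data is not None


def collect_params(extra: Optional[dict]) -> Optional[MultimodalParamsSketch]:
    """Hypothetical helper mirroring the gating in the hunk above."""
    if extra is None:
        return None
    params = MultimodalParamsSketch(
        multimodal_input=extra.get("multimodal_input"),
        multimodal_data=extra.get("multimodal_data"),
    )
    # Only pass the container downstream if it has content.
    return params if params.has_content() else None


assert collect_params(None) is None                      # no extra inputs
assert collect_params({}) is None                        # text-only request
assert collect_params({"multimodal_data": {"image": 1}}).has_content()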
@ -907,12 +912,10 @@
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
            streaming=streaming,
            multimodal_input=multimodal_input,
            multimodal_embedding=multimodal_embedding,
            mrope_config=mrope_config,
            kv_cache_retention_config=kv_cache_retention_config,
            disaggregated_params=disaggregated_params,
            postproc_params=_postproc_params,
            multimodal_params=multimodal_params,
        )

        return RequestOutput._from_generation_result(result, prompt,
@ -996,8 +999,8 @@
                raise ValueError(
                    "tokenizer is required to initialize a default sampling_params, or you can explicitly specify a sampling_params"
                )
            return SamplingParams(end_id=self.tokenizer.eos_token_id,
                                  pad_id=self.tokenizer.pad_token_id)
            sampling_params = SamplingParams(end_id=self.tokenizer.eos_token_id,
                                             pad_id=self.tokenizer.pad_token_id)
        elif isinstance(sampling_params, SamplingParams):
            if sampling_params.end_id is None:
                if self.tokenizer is None:
@ -1005,21 +1008,26 @@
                        "tokenizer is required to reset end_id if it is None, or you can explicitly specify the end_id for sampling_params"
                    )
                sampling_params._setup(self.tokenizer)
            # auto enabled context and/or generation logits flags, as they are required by logprob computation for TRT backend.
            if self.args.backend not in ["pytorch", "_autodeploy"]:
                if sampling_params.prompt_logprobs and not sampling_params.return_context_logits:
                    sampling_params.return_context_logits = True
                    sampling_params._context_logits_auto_enabled = True
                if sampling_params.logprobs and not sampling_params.return_generation_logits:
                    sampling_params.return_generation_logits = True
                    sampling_params._generation_logits_auto_enabled = True

            return sampling_params
        else:
            raise TypeError(
                f"The sampling_params must be type SamplingParams or None, but got {type(sampling_params)}"
            )

        # auto enabled context and/or generation logits flags, as they are required by logprob computation for TRT backend.
        if self.args.backend not in ["pytorch", "_autodeploy"]:
            if sampling_params.prompt_logprobs and not sampling_params.return_context_logits:
                sampling_params.return_context_logits = True
                sampling_params._context_logits_auto_enabled = True
            if sampling_params.logprobs and not sampling_params.return_generation_logits:
                sampling_params.return_generation_logits = True
                sampling_params._generation_logits_auto_enabled = True

        if sampling_params._stream_interval is None:
            sampling_params._stream_interval = getattr(self.args,
                                                       "stream_interval", 1)

        return sampling_params

    def _check_arguments(self, prompt_len: int, query_len: int,
                         sampling_params: SamplingParams) -> None:
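The refactor above replaces the early return of a fresh SamplingParams with an assignment, so both the None branch and the SamplingParams branch fall through to shared post-processing (logits auto-enabling and the new _stream_interval default). A simplified sketch of that control flow, using stand-in names rather than the real SamplingParams:

from dataclasses import dataclass
from typing import Optional


@dataclass
class ParamsSketch:
    """Stand-in for SamplingParams; fields are illustrative only."""
    end_id: Optional[int] = None
    logprobs: bool = False
    return_generation_logits: bool = False
    stream_interval: Optional[int] = None


def prepare(params: Optional[ParamsSketch], eos_token_id: int,
            backend: str, default_stream_interval: int = 1) -> ParamsSketch:
    if params is None:
        # Assign instead of returning early, so the shared steps below run.
        params = ParamsSketch(end_id=eos_token_id)
    elif not isinstance(params, ParamsSketch):
        raise TypeError(f"expected ParamsSketch or None, got {type(params)}")
    # Shared post-processing, now reached from every branch:
    if backend not in ("pytorch", "_autodeploy") and params.logprobs:
        params.return_generation_logits = True  # needed for logprob computation
    if params.stream_interval is None:
        params.stream_interval = default_stream_interval
    return params


p = prepare(None, eos_token_id=2, backend="trt")
assert p.end_id == 2 and p.stream_interval == 1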
@ -492,6 +493,7 @@
Source code for tensorrt_llm.llmapi.llm_args
import copy
import functools
import json
import math
import os
@ -730,7 +732,8 @@

class DecodingBaseConfig(BaseModel):
    max_draft_len: Optional[int] = None
    speculative_model: Optional[Union[str, Path]] = None
    speculative_model_dir: Optional[Union[str, Path]] = None
    num_extra_kv_tokens: int = 0

    @classmethod
    def from_dict(cls, data: dict):
@ -743,6 +746,7 @@
            "Lookahead": LookaheadDecodingConfig,
            "NGram": NGramDecodingConfig,
            "DraftTarget": DraftTargetDecodingConfig,
            "UserProvided": UserProvidedDecodingConfig,
        }

        config_class = config_classes.get(decoding_type)
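The from_dict classmethod dispatches on a decoding_type string, and the hunk above registers the new "UserProvided" entry in that table. A self-contained sketch of the dispatch shape; the underscore-prefixed classes are placeholders, not the real config classes.

class _Base:
    @classmethod
    def from_dict(cls, data: dict):
        return cls()


class _Medusa(_Base): ...
class _UserProvided(_Base): ...


config_classes = {
    "Medusa": _Medusa,
    "UserProvided": _UserProvided,  # newly registered entry
}


def build(data: dict) -> _Base:
    decoding_type = data.pop("decoding_type")
    config_class = config_classes.get(decoding_type)
    if config_class is None:
        raise ValueError(f"unknown decoding_type: {decoding_type!r}")
    return config_class.from_dict(data)


assert isinstance(build({"decoding_type": "UserProvided"}), _UserProvided)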
@ -754,6 +758,35 @@
    def _check_fields(self):
        pass

    def supports_backend(self, backend: str) -> bool:
        """
        Override if the speculation algorithm does not support
        a subset of the possible backends.
        """
        return True

    def validate(self) -> None:
        """
        Do any additional error checking here.
        """

    @functools.cached_property
    def spec_dec_mode(self):
        # spec_dec_mode has more functionality than the raw decoding_mode string.
        # Use an alias for the import here to avoid name collisions with the one for the
        # TRT backend.
        from tensorrt_llm._torch.speculative.interface import \
            SpeculativeDecodingMode as TorchSpeculativeDecodingMode
        return TorchSpeculativeDecodingMode.from_string(
            self.decoding_type.upper())

    def update_from_model_config(self, model_config):
        pass

    def get_draft_model_prompt(self,
                               input_tokens: torch.Tensor) -> torch.Tensor:
        return input_tokens
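These hooks give every speculation config a uniform capability surface: supports_backend narrows where the algorithm may run (as the Medusa override below shows), validate raises on inconsistent settings, and get_draft_model_prompt lets a config reshape the draft model's input. A minimal sketch of how a caller might compose them; BaseSpec and check_spec_config are hypothetical stand-ins, not library API.

import torch


class BaseSpec:
    def supports_backend(self, backend: str) -> bool:
        return True  # default: every backend is allowed

    def validate(self) -> None:
        pass  # subclasses raise on inconsistent settings

    def get_draft_model_prompt(self, input_tokens: torch.Tensor) -> torch.Tensor:
        return input_tokens  # default: draft model sees the full prompt


def check_spec_config(config: BaseSpec, backend: str) -> None:
    if not config.supports_backend(backend):
        raise ValueError(f"speculation config rejects backend {backend!r}")
    config.validate()


check_spec_config(BaseSpec(), "pytorch")  # passes with the defaults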
@ -768,7 +801,13 @@
        return cls(**data)

    decoding_type: ClassVar[str] = "Medusa"

    def supports_backend(self, backend: str) -> bool:
        return backend not in ("pytorch", "_autodeploy")
@ -782,7 +821,6 @@
    dynamic_tree_max_topK: Optional[int] = None
    num_eagle_layers: Optional[int] = None
    max_non_leaves_per_layer: Optional[int] = None
    pytorch_weights_path: Optional[str] = None
    eagle3_one_model: Optional[bool] = True
@ -792,7 +830,49 @@
        return cls(**data)

    decoding_type: ClassVar[str] = "Eagle"

    def validate(self) -> None:
        if self.speculative_model_dir is None:
            raise ValueError("Draft model must be provided for EAGLE")

    @functools.cached_property
    def spec_dec_mode(self):
        from tensorrt_llm._torch.speculative.interface import \
            SpeculativeDecodingMode as TorchSpeculativeDecodingMode
        if self.eagle3_one_model:
            return TorchSpeculativeDecodingMode.EAGLE3_ONE_MODEL
        return TorchSpeculativeDecodingMode.EAGLE3

    def get_draft_model_prompt(self,
                               input_tokens: torch.Tensor) -> torch.Tensor:
        """
        Eagle3 always throws away the first token when processing draft inputs
        """
        return input_tokens[1:]
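EagleDecodingConfig gains an explicit validate() and a spec_dec_mode property in this hunk. A sketch under the same import assumption; the checkpoint path is hypothetical:

from tensorrt_llm.llmapi import EagleDecodingConfig

cfg = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir="/ckpts/eagle3-draft",  # hypothetical path
    eagle3_one_model=True,
)
cfg.validate()            # would raise ValueError if speculative_model_dir were None
print(cfg.spec_dec_mode)  # EAGLE3_ONE_MODEL, per the cached property above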
<div class="viewcode-block" id="UserProvidedDecodingConfig">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.UserProvidedDecodingConfig">[docs]</a>
|
||||
<span class="k">class</span><span class="w"> </span><span class="nc">UserProvidedDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
|
||||
<span class="c1"># Type should be Drafter, but it leads to circular import</span>
|
||||
<span class="n">drafter</span><span class="p">:</span> <span class="nb">object</span>
|
||||
|
||||
<div class="viewcode-block" id="UserProvidedDecodingConfig.from_dict">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.UserProvidedDecodingConfig.from_dict">[docs]</a>
|
||||
<span class="nd">@classmethod</span>
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">from_dict</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="nb">dict</span><span class="p">):</span>
|
||||
<span class="k">return</span> <span class="bp">cls</span><span class="p">(</span><span class="o">**</span><span class="n">data</span><span class="p">)</span></div>
|
||||
|
||||
|
||||
<span class="n">decoding_type</span><span class="p">:</span> <span class="n">ClassVar</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"User_Provided"</span></div>
|
||||
|
||||
|
||||
|
||||
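UserProvidedDecodingConfig simply wraps a caller-supplied drafter; the field is typed as object only to dodge the circular import noted in the comment. A sketch with a stand-in drafter (a real one must implement the Drafter interface, which is not shown in this diff):

from tensorrt_llm.llmapi import UserProvidedDecodingConfig

class StubDrafter:  # placeholder; the required drafter interface is defined elsewhere
    pass

cfg = UserProvidedDecodingConfig(drafter=StubDrafter())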
@ -803,7 +883,7 @@
    Configuration for NGram drafter speculative decoding.

    Arguments:
        prompt_lookup_num_tokens: int
        max_draft_len: int
            The maximum length of draft tokens (can be understood as the maximum length of output draft tokens).

        max_matching_ngram_size: int
@ -819,7 +899,6 @@
            Whether to use a common pool for all requests, or the pool is private for each request if False.
    """

    prompt_lookup_num_tokens: int = 2
    max_matching_ngram_size: int = 4
    is_keep_all: bool = True
    is_use_oldest: bool = True
@ -832,14 +911,19 @@
        return cls(**data)

    decoding_type: ClassVar[str] = "NGram"

    def supports_backend(self, backend: str) -> bool:
        return backend == "pytorch"
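Putting the NGram pieces together — a sketch using the fields shown above (is_public_pool appears in the docstring hunk); the new supports_backend gates NGram to the PyTorch backend:

from tensorrt_llm.llmapi import NGramDecodingConfig

cfg = NGramDecodingConfig(
    max_draft_len=8,  # inherited field; replaces prompt_lookup_num_tokens in the docstring
    max_matching_ngram_size=4,
    is_keep_all=True,
    is_use_oldest=True,
    is_public_pool=True,
)
assert cfg.supports_backend("pytorch") and not cfg.supports_backend("trt")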
<div class="viewcode-block" id="DraftTargetDecodingConfig">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.DraftTargetDecodingConfig">[docs]</a>
|
||||
<span class="k">class</span><span class="w"> </span><span class="nc">DraftTargetDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
|
||||
<span class="n">pytorch_weights_path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
|
||||
|
||||
<div class="viewcode-block" id="DraftTargetDecodingConfig.from_dict">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.DraftTargetDecodingConfig.from_dict">[docs]</a>
|
||||
@ -848,18 +932,35 @@
        return cls(**data)

    decoding_type: ClassVar[str] = "DraftTarget"
    decoding_type: ClassVar[str] = "Draft_Target"

    def supports_backend(self, backend: str) -> bool:
        return backend == "pytorch"
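Note the decoding_type string changes from "DraftTarget" to "Draft_Target" in this hunk, and the mode is now PyTorch-only. Sketch (path hypothetical):

from tensorrt_llm.llmapi import DraftTargetDecodingConfig

cfg = DraftTargetDecodingConfig(
    max_draft_len=4,
    pytorch_weights_path="/ckpts/draft-model",  # hypothetical path
)
assert cfg.supports_backend("pytorch")
assert not cfg.supports_backend("_autodeploy")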
<div class="viewcode-block" id="MTPDecodingConfig">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig">[docs]</a>
|
||||
<span class="k">class</span><span class="w"> </span><span class="nc">MTPDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
|
||||
<span class="n">num_nextn_predict_layers</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
|
||||
<span class="n">use_relaxed_acceptance_for_thinking</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span>
|
||||
<span class="n">relaxed_topk</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span>
|
||||
<span class="n">relaxed_delta</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mf">0.</span>
|
||||
<span class="n">use_mtp_vanilla</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span>
|
||||
<span class="n">num_nextn_predict_layers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span>
|
||||
<span class="n">use_relaxed_acceptance_for_thinking</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
||||
<span class="n">relaxed_topk</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span>
|
||||
<span class="n">relaxed_delta</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.</span>
|
||||
<span class="n">use_mtp_vanilla</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
|
||||
|
||||
<span class="c1"># TODO: remove this after distinguishing `max_draft_len` and `num_nextn_predict_layers`</span>
|
||||
<span class="c1"># Now we need a flag when MTPDecodingConfig is updated by PyTorchModelEngine.</span>
|
||||
<span class="n">num_nextn_predict_layers_from_model_config</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span>
|
||||
|
||||
<span class="c1"># TODO: Hard code for DeepSeek R1</span>
|
||||
<span class="c1"># When encounter <think>, start thinking phase.</span>
|
||||
<span class="c1"># When encounter </think>, end thinking phase.</span>
|
||||
<span class="c1"># <think> [thinking phase] </think> [real output]</span>
|
||||
<span class="n">BEGIN_THINKING_PHASE_TOKEN</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">128798</span>
|
||||
<span class="n">END_THINKING_PHASE_TOKEN</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">128799</span>
|
||||
|
||||
<div class="viewcode-block" id="MTPDecodingConfig.from_dict">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig.from_dict">[docs]</a>
|
||||
@ -868,7 +969,29 @@
        return cls(**data)

    decoding_type: ClassVar[str] = "MTP"

    def supports_backend(self, backend: str) -> bool:
        return backend == "pytorch"

    @functools.cached_property
    def spec_dec_mode(self):
        from tensorrt_llm._torch.speculative.interface import \
            SpeculativeDecodingMode as TorchSpeculativeDecodingMode
        if self.num_nextn_predict_layers_from_model_config == 1 and not self.use_mtp_vanilla:
            return TorchSpeculativeDecodingMode.MTP_EAGLE
        return TorchSpeculativeDecodingMode.MTP

    def update_from_model_config(self, model_config):
        assert self.num_nextn_predict_layers > 0
        if model_config.num_nextn_predict_layers == 1 and not self.use_mtp_vanilla:
            self.num_extra_kv_tokens = self.num_nextn_predict_layers - 1
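The MTP mode selection above picks MTP_EAGLE only when the model reports a single next-N predict layer and vanilla MTP is not forced. Sketch (same import assumption):

from tensorrt_llm.llmapi import MTPDecodingConfig

cfg = MTPDecodingConfig(
    num_nextn_predict_layers=3,
    use_relaxed_acceptance_for_thinking=True,
    relaxed_topk=10,
    relaxed_delta=0.6,
)
assert cfg.supports_backend("pytorch")
# num_nextn_predict_layers_from_model_config defaults to 1 and
# use_mtp_vanilla defaults to False, so:
print(cfg.spec_dec_mode)  # -> SpeculativeDecodingMode.MTP_EAGLE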
@ -1190,6 +1313,12 @@
            self.max_ngram_size,
            self.max_verification_set_size)

    def supports_backend(self, backend: str) -> bool:
        return backend not in ("pytorch", "_autodeploy")

    decoding_type: ClassVar[str] = "Lookahead"
@ -1201,6 +1330,7 @@
    MedusaDecodingConfig,
    MTPDecodingConfig,
    NGramDecodingConfig,
    UserProvidedDecodingConfig,
]]
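Any member of this union can be handed to the LLM entry point as speculative_config; the validator further down then routes it by type. A sketch (model path hypothetical):

from tensorrt_llm.llmapi import LLM, NGramDecodingConfig

llm = LLM(
    model="/models/llama",  # hypothetical checkpoint path
    speculative_config=NGramDecodingConfig(max_draft_len=8,
                                           max_matching_ngram_size=4),
)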
@ -1601,7 +1731,7 @@
        return self._model_format

    @property
    def speculative_model(self) -> Optional[_ModelFormatKind]:
    def speculative_model_dir(self) -> Optional[_ModelFormatKind]:
        return self._speculative_model

    @property
@ -1878,33 +2008,40 @@
    @model_validator(mode="after")
    def validate_speculative_config(self):
        if self.speculative_config:
            if isinstance(self.speculative_config, LookaheadDecodingConfig):
                lookahead_config = self.speculative_config
                # Update the build config
                _, _, max_draft_tokens, _ = lookahead_config.calculate_speculative_resource(
                )
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.LOOKAHEAD_DECODING
                if max_draft_tokens > self.build_config.max_draft_len:
                    self.build_config.max_draft_len = max_draft_tokens
            if not self.speculative_config.supports_backend(self.backend):
                raise ValueError(
                    f"Speculation type {self.speculative_config.decoding_type} does not "
                    f"support backend {self.backend}")

            # Below, we only need to set speculative_decoding_mode/decoding_config for speculation
            # on the TRT backend.
            if isinstance(self.speculative_config, LookaheadDecodingConfig):
                max_draft_len = self.speculative_config.calculate_speculative_resource(
                )[2]
                assert max_draft_len > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.LOOKAHEAD_DECODING
                self.build_config.max_draft_len = max(
                    self.build_config.max_draft_len, max_draft_len)
                self.decoding_config = DecodingConfig(
                    decoding_mode=DecodingMode.Lookahead(),
                    lookahead_decoding_config=PybindMirror.maybe_to_pybind(
                        lookahead_config))
            elif isinstance(self.speculative_config, MedusaDecodingConfig):
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA
                        self.speculative_config))

            elif isinstance(self.speculative_config, MedusaDecodingConfig):
                assert self.speculative_config.max_draft_len > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.MEDUSA
                self.build_config.max_draft_len = self.speculative_config.max_draft_len
                self.decoding_config = DecodingConfig(
                    decoding_mode=DecodingMode.Medusa(),
                    medusa_choices=self.speculative_config.medusa_choices)

            elif isinstance(self.speculative_config, EagleDecodingConfig):
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
                assert self.speculative_config.max_draft_len > 0

                assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
                self.build_config.max_draft_len = self.speculative_config.max_draft_len

                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
                if self.speculative_config.eagle3_one_model:
                    self.speculative_config.num_extra_kv_tokens = self.speculative_config.max_draft_len - 1
                if self.backend not in ['pytorch', '_autodeploy']:
                    eagle_config = _EagleConfig(
                        self.speculative_config.eagle_choices,
@ -1915,59 +2052,39 @@
                    self.decoding_config = DecodingConfig(
                        decoding_mode=DecodingMode.Eagle(),
                        eagle_config=eagle_config)
                else:
                    from tensorrt_llm._torch.speculative import Eagle3Config
                    self.speculative_config = Eagle3Config(
                        max_draft_tokens=self.speculative_config.max_draft_len,
                        draft_model_path=self.speculative_config.
                        pytorch_weights_path,
                        eagle3_one_model=self.speculative_config.
                        eagle3_one_model)

            elif isinstance(self.speculative_config, NGramDecodingConfig):
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
                assert self.backend in ['pytorch', '_autodeploy']
                assert self.speculative_config.prompt_lookup_num_tokens > 0 and self.speculative_config.max_matching_ngram_size > 0
                assert self.speculative_config.max_draft_len > 0 and self.speculative_config.max_matching_ngram_size > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
                self.build_config.max_draft_len = self.speculative_config.max_draft_len
                from tensorrt_llm._torch.speculative import NGramConfig
                self.speculative_config = NGramConfig(
                    prompt_lookup_num_tokens=self.speculative_config.
                    prompt_lookup_num_tokens,
                    max_matching_ngram_size=self.speculative_config.
                    max_matching_ngram_size,
                    is_keep_all=self.speculative_config.is_keep_all,
                    is_use_oldest=self.speculative_config.is_use_oldest,
                    is_public_pool=self.speculative_config.is_public_pool,
                )

            elif isinstance(self.speculative_config, DraftTargetDecodingConfig):
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
                assert self.backend == 'pytorch'
                assert self.backend in ['pytorch']
                assert self.speculative_config.max_draft_len > 0
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
                self.build_config.max_draft_len = self.speculative_config.max_draft_len
                from tensorrt_llm._torch.speculative import DraftTargetConfig
                self.speculative_config = DraftTargetConfig(
                    max_draft_tokens=self.speculative_config.max_draft_len,
                    draft_model_path=self.speculative_config.
                    pytorch_weights_path)

            elif isinstance(self.speculative_config, MTPDecodingConfig):
                from tensorrt_llm._torch.speculative import MTPConfig
                self.speculative_config = MTPConfig(
                    num_nextn_predict_layers=self.speculative_config.
                    num_nextn_predict_layers,
                    max_batch_size=self.build_config.max_batch_size,
                    use_relaxed_acceptance_for_thinking=self.speculative_config.
                    use_relaxed_acceptance_for_thinking,
                    relaxed_topk=self.speculative_config.relaxed_topk,
                    relaxed_delta=self.speculative_config.relaxed_delta,
                    use_mtp_vanilla=self.speculative_config.use_mtp_vanilla)
                assert self.speculative_config.num_nextn_predict_layers > 0
                self.speculative_config.max_draft_len = self.speculative_config.num_nextn_predict_layers

            elif isinstance(self.speculative_config,
                            UserProvidedDecodingConfig):
                assert self.backend in ['pytorch', '_autodeploy']
                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.USER_PROVIDED
                self.build_config.max_draft_len = self.speculative_config.max_draft_len

            else:
                raise ValueError(
                    f"Speculative config type not recognized: {self.speculative_config}"
                    f"Unrecognized speculative config type {type(self.speculative_config)}"
                )

        else:
            self.decoding_config = None

        self._speculative_model = getattr(self.speculative_config,
                                          "speculative_model", None)
                                          "speculative_model_dir", None)
        speculative_model_obj = _ModelWrapper(
            self._speculative_model
        ) if self._speculative_model is not None else None
@ -2299,7 +2416,7 @@
    moe_backend: str = Field(default='CUTLASS',
                             description="MoE backend to use.")

    mixed_sampler: bool = Field(
    enable_mixed_sampler: bool = Field(
        default=False,
        description=
        "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
@ -2329,7 +2446,7 @@
    torch_compile_config: Optional[TorchCompileConfig] = Field(
        default=None, description="Torch compile config.")

    autotuner_enabled: bool = Field(
    enable_autotuner: bool = Field(
        default=True,
        description="Enable autotuner only when torch compile is enabled.")
@ -2532,7 +2649,7 @@
            moe_load_balancer=self.moe_load_balancer,
            attn_backend=self.attn_backend,
            moe_backend=self.moe_backend,
            mixed_sampler=self.mixed_sampler,
            enable_mixed_sampler=self.enable_mixed_sampler,
            enable_trtllm_sampler=self.enable_trtllm_sampler,
            kv_cache_dtype=self.kv_cache_dtype,
            enable_iter_perf_stats=self.enable_iter_perf_stats,
@ -2552,7 +2669,7 @@
            torch_compile_enable_userbuffers=self.torch_compile_config.
            enable_userbuffers if self.torch_compile_config is not None else
            TorchCompileConfig.model_fields['enable_userbuffers'].default,
            autotuner_enabled=self.autotuner_enabled,
            enable_autotuner=self.enable_autotuner,
            enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker,
            load_format=self.load_format,
            enable_min_latency=self.enable_min_latency,
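The last two hunks finish the rename of the PyTorch-backend knobs: mixed_sampler becomes enable_mixed_sampler and autotuner_enabled becomes enable_autotuner. A sketch of the new spelling only — the owning config class is not shown in this hunk, so the dict below is purely illustrative:

pytorch_overrides = dict(
    enable_mixed_sampler=True,  # pick the sampling strategy per request (top-k, top-p, ...)
    enable_autotuner=True,      # only takes effect when torch.compile is enabled
)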
@ -2748,9 +2865,9 @@
<div class="footer-item">
<div class="extra_footer">

<p>Last updated on July 06, 2025.</p>
<p>Last updated on July 14, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

</div></div>

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc2" />
<meta name="docsearch:version" content="1.0.0rc3" />
</head>
@ -341,6 +341,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -401,7 +402,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="nav bd-sidenav">
||||
@ -1132,9 +1133,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc2" />
<meta name="docsearch:version" content="1.0.0rc3" />
</head>
@ -341,6 +341,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -401,7 +402,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="nav bd-sidenav">
@ -1448,11 +1449,11 @@
<span class="sd"> spec_decoding_position_offsets: [bs, max_gen_tokens]</span>
<span class="sd"> spec_decoding_packed_mask: [bs, max_draft_len, packed_length] **</span>
<span class="sd"> eagle_temperature: [bs]</span>
<span class="sd"> rand_data_validation: [bs, max_draft_tokens]</span>
<span class="sd"> rand_data_validation: [bs, max_draft_len]</span>
<span class="sd"> ** The mask is tricky since the boolean mask will need to be</span>
<span class="sd"> packed in runtime. So, the last dim will be:</span>
<span class="sd"> packed_length = ceil((max_draft_tokens+1)/32)</span>
<span class="sd"> packed_length = ceil((max_draft_len+1)/32)</span>
<span class="sd"> """</span>
<span class="n">default_range</span> <span class="o">=</span> <span class="n">GenerationMixin</span><span class="o">.</span><span class="n">default_range</span>
<span class="n">remove_input_padding</span> <span class="o">=</span> <span class="n">default_net</span><span class="p">()</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">remove_input_padding</span>
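The docstring fix above standardizes on `max_draft_len` and spells out the packing rule for `spec_decoding_packed_mask`: a boolean row of width `max_draft_len + 1` is packed 32 bits per int32 word, so the last dimension becomes `packed_length = ceil((max_draft_len + 1) / 32)`. A minimal pure-Python sketch of that packing, illustrative only and not the runtime kernel:

```python
import math

def pack_bool_row(row):
    """Pack a list of booleans into ceil(len(row) / 32) 32-bit words.

    Bit i of word k holds element 32 * k + i; words are kept as plain
    Python ints here rather than signed int32 values.
    """
    packed_length = math.ceil(len(row) / 32)
    words = [0] * packed_length
    for i, bit in enumerate(row):
        if bit:
            words[i // 32] |= 1 << (i % 32)
    return words

max_draft_len = 63
row = [True] * (max_draft_len + 1)   # one mask row of width max_draft_len + 1
print(len(pack_bool_row(row)))       # 2 == ceil(64 / 32)
```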
@ -1731,7 +1732,7 @@
<span class="n">quant_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">QuantConfig</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="k">assert</span> <span class="n">hf_model_or_dir</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">speculative_model_dir</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'speculative_model'</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">speculative_model_dir</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'speculative_model_dir'</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">tllm_config</span> <span class="o">=</span> <span class="n">EagleConfig</span><span class="o">.</span><span class="n">from_hugging_face</span><span class="p">(</span><span class="n">hf_model_or_dir</span><span class="p">,</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">mapping</span><span class="o">=</span><span class="n">mapping</span><span class="p">,</span>
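The one-line fix above makes the method read the draft-model path from the `speculative_model_dir` keyword instead of `speculative_model`; before the change, a caller passing `speculative_model_dir` would silently get `None`. A hedged usage sketch of the call shape this implies (the classmethod owner and the model paths are assumptions inferred from the hunk, not shown in this diff):

```python
# Assumed call shape for the Eagle from_hugging_face classmethod this hunk
# patches; both paths below are placeholders.
from tensorrt_llm.models import EagleForCausalLM

model = EagleForCausalLM.from_hugging_face(
    "/path/to/base-model",
    dtype="float16",
    # Now picked up via kwargs.get('speculative_model_dir', None):
    speculative_model_dir="/path/to/eagle-draft-model",
)
```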
@ -1941,9 +1942,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -401,7 +402,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -2846,9 +2847,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -650,10 +651,10 @@
                if default_net().plugin_config.reduce_fusion else
                AllReduceFusionOp.NONE,
                residual=residual,
                norm_weight=self.post_layernorm.weight.value,
                norm_pre_residual_weight=self.pre_feedforward_layernorm.weight.
                value if self.config.inter_layernorms else None,
                eps=self.post_layernorm.eps))
                norm_weight=self.pre_feedforward_layernorm.weight.value,
                norm_pre_residual_weight=self.post_layernorm.weight.value
                if self.config.inter_layernorms else None,
                eps=self.pre_feedforward_layernorm.eps))

        if use_cache:
            attention_output, presents = attention_output
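For context on the hunk above: in models with inter-layernorms (the Gemma2 sandwich pattern), the fused allreduce applies one RMSNorm to the allreduced tensor before the residual add and another to the sum, so swapping `norm_weight` and `norm_pre_residual_weight` changes which weights land where. The following is a minimal PyTorch sketch of that epilogue under those assumptions; `rms_norm` and `fused_epilogue` are illustrative names, not the TensorRT-LLM fused kernel.

```python
import torch


def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # Standard RMSNorm: scale by reciprocal RMS, then by the learned weight.
    variance = x.pow(2).mean(-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight


def fused_epilogue(partial_sum, residual, norm_weight,
                   norm_pre_residual_weight, eps):
    """Reference semantics (assumed) of the fused allreduce epilogue above.

    The pre-residual weight normalizes the tensor *before* it joins the
    residual stream; `norm_weight` normalizes the result. Passing them in the
    wrong order silently applies the wrong weights, which the diff corrects.
    """
    if norm_pre_residual_weight is not None:
        partial_sum = rms_norm(partial_sum, norm_pre_residual_weight, eps)
    hidden = partial_sum + residual
    return rms_norm(hidden, norm_weight, eps)
```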
@ -570,7 +571,7 @@
        import transformers

        trust_remote_code = kwargs.pop('trust_remote_code', True)
        speculative_config_or_dir = kwargs.pop('speculative_model', None)
        speculative_config_or_dir = kwargs.pop('speculative_model_dir', None)
        speculative_config = kwargs.pop("speculative_config", None)

        if isinstance(hf_config_or_dir, transformers.PretrainedConfig):
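The hunk above renames the `speculative_model` keyword argument to `speculative_model_dir`. Where a transition period is needed, a small shim can accept both spellings; the helper below is a hypothetical illustration, not part of TensorRT-LLM.

```python
import warnings


def pop_renamed_kwarg(kwargs: dict, old: str, new: str, default=None):
    """Pop `new` from kwargs, falling back to the deprecated `old` spelling."""
    if old in kwargs:
        warnings.warn(f"'{old}' is deprecated; use '{new}' instead.",
                      DeprecationWarning, stacklevel=2)
        return kwargs.pop(old)
    return kwargs.pop(new, default)


# Usage at a call site like the one above:
# speculative_config_or_dir = pop_renamed_kwarg(
#     kwargs, 'speculative_model', 'speculative_model_dir')
```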
@ -688,7 +689,7 @@
        import transformers

        assert hf_model_or_dir is not None
        speculative_model_dir = kwargs.get('speculative_model', None)
        speculative_model_dir = kwargs.get('speculative_model_dir', None)

        use_preloading = isinstance(hf_model_or_dir,
                                    transformers.PreTrainedModel)
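Note why this call site uses `kwargs.get` while the earlier one uses `kwargs.pop`: `pop` consumes the key so it is not forwarded again, while `get` leaves it in place for downstream consumers. A quick self-contained check (the path value is hypothetical):

```python
kwargs = {'speculative_model_dir': '/tmp/draft'}  # hypothetical value
assert kwargs.get('speculative_model_dir') == '/tmp/draft'
assert 'speculative_model_dir' in kwargs      # get() leaves the key in place
assert kwargs.pop('speculative_model_dir') == '/tmp/draft'
assert 'speculative_model_dir' not in kwargs  # pop() consumes it
```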
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -401,7 +402,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -592,6 +593,7 @@
     EXPLICIT_DRAFT_TOKENS = auto()
     EAGLE = auto()
     NGRAM = auto()
     USER_PROVIDED = auto()

@ -611,6 +613,8 @@
             return SpeculativeDecodingMode.EAGLE
         elif args.speculative_decoding_mode == "ngram":
             return SpeculativeDecodingMode.NGRAM
         elif args.speculative_decoding_mode == "user_provided":
             return SpeculativeDecodingMode.USER_PROVIDED
         else:
             assert False, "Unknown speculative_decoding_mode " + args.speculative_decoding_mode
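The dispatcher above maps a CLI string onto an enum member. A short usage sketch, assuming the import path given by the viewcode anchor (tensorrt_llm.llmapi.SpeculativeDecodingMode) and a hypothetical argparse namespace in place of a real parser:

from argparse import Namespace

from tensorrt_llm.llmapi import SpeculativeDecodingMode

# Hypothetical parsed arguments; a real parser would expose
# --speculative_decoding_mode with choices such as "eagle", "ngram", "user_provided".
args = Namespace(speculative_decoding_mode="ngram")

mode = SpeculativeDecodingMode.from_arguments(args)
assert mode is SpeculativeDecodingMode.NGRAM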
@ -672,15 +673,15 @@
         bb_range = default_range(max_batch_size)
         bb0_range = default_range(max_batch_size, min_range=0, opt_offset=1)
         num_beam_tokens = self.num_beams * self.beam_length
-        max_draft_tokens = num_beam_tokens - self.num_beams  # ignore the true token
-        max_gen_token_len = 1 + max_draft_tokens  # for the true token
+        max_draft_len = num_beam_tokens - self.num_beams  # ignore the true token
+        max_gen_token_len = 1 + max_draft_len  # for the true token
         max_gen_token_len_range = default_range(max_gen_token_len)
         bb_max_gen_token_len_range = default_range(max_gen_token_len * max_batch_size,
                                                    min_range=0)

         kwargs['speculative_decoding_draft_tokens_external'] = False
-        kwargs['max_draft_len'] = max_draft_tokens
+        kwargs['max_draft_len'] = max_draft_len
         kwargs['spec_decoding_is_generation_length_variable'] = True
         inputs = super().prepare_inputs(*args, **kwargs)
         assert inputs['spec_decoding_params'] is not None
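The rename from max_draft_tokens to max_draft_len does not change the arithmetic. With hypothetical values num_beams = 4 and beam_length = 3, the token budget works out as follows (standalone Python mirroring the lines above):

num_beams = 4    # hypothetical beam count
beam_length = 3  # hypothetical tokens per beam, including the true token

num_beam_tokens = num_beams * beam_length    # 12 tokens across all beams
max_draft_len = num_beam_tokens - num_beams  # 8: one true token per beam is excluded
max_gen_token_len = 1 + max_draft_len        # 9: the true token plus the draft budget

print(max_draft_len, max_gen_token_len)  # 8 9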
@ -925,9 +926,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -401,7 +402,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1466,9 +1467,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -401,7 +402,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1034,9 +1035,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -401,7 +402,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1881,9 +1882,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -401,7 +402,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1149,9 +1150,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -401,7 +402,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -5437,9 +5438,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -341,6 +341,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
|
||||
@ -401,7 +402,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (Experimental)</a></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1096,9 +1097,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on July 06, 2025.</p>
|
||||
<p>Last updated on July 14, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -67,7 +67,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc2" />
|
||||
<meta name="docsearch:version" content="1.0.0rc3" />
|
||||
|
||||
|
||||
</head>
@@ -767,6 +768,9 @@
    truncate_prompt_tokens: Optional[int] = None
    skip_special_tokens: bool = True
    spaces_between_special_tokens: bool = True
    # Currently, _stream_interval is only used to pass llm.args.stream_interval to tokenizer.
    # TODO: make this a per-request parameter.
    _stream_interval: Optional[int] = field(default=None, init=False, repr=False)

    def __post_init__(self):
        if self.pad_id is None:
@@ -4,46 +4,16 @@ Executor

.. Here are files in the cpp/include/executor
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py

tensor.h
________

.. doxygenfile:: tensor.h
   :project: TensorRT-LLM

types.h
_______

.. doxygenfile:: types.h
   :project: TensorRT-LLM

disaggServerUtil.h
__________________

.. doxygenfile:: disaggServerUtil.h
   :project: TensorRT-LLM

dataTransceiverState.h
______________________
tensor.h
________

.. doxygenfile:: dataTransceiverState.h
   :project: TensorRT-LLM

executor.h
__________

.. doxygenfile:: executor.h
   :project: TensorRT-LLM

serialization.h
_______________

.. doxygenfile:: serialization.h
   :project: TensorRT-LLM

cacheCommunicator.h
___________________

.. doxygenfile:: cacheCommunicator.h
.. doxygenfile:: tensor.h
   :project: TensorRT-LLM

transferAgent.h
@@ -52,3 +22,33 @@ _______________

.. doxygenfile:: transferAgent.h
   :project: TensorRT-LLM

serialization.h
_______________

.. doxygenfile:: serialization.h
   :project: TensorRT-LLM

types.h
_______

.. doxygenfile:: types.h
   :project: TensorRT-LLM

executor.h
__________

.. doxygenfile:: executor.h
   :project: TensorRT-LLM

dataTransceiverState.h
______________________

.. doxygenfile:: dataTransceiverState.h
   :project: TensorRT-LLM

cacheCommunicator.h
___________________

.. doxygenfile:: cacheCommunicator.h
   :project: TensorRT-LLM
@@ -4,100 +4,10 @@ Runtime

.. Here are files in the cpp/include/runtime
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py

gptJsonConfig.h
_______________
lookaheadBuffers.h
__________________

.. doxygenfile:: gptJsonConfig.h
   :project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
   :project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
   :project: TensorRT-LLM

common.h
________

.. doxygenfile:: common.h
   :project: TensorRT-LLM

ipcUtils.h
__________

.. doxygenfile:: ipcUtils.h
   :project: TensorRT-LLM

iGptDecoderBatched.h
____________________

.. doxygenfile:: iGptDecoderBatched.h
   :project: TensorRT-LLM

eagleBuffers.h
______________

.. doxygenfile:: eagleBuffers.h
   :project: TensorRT-LLM

samplingConfig.h
________________

.. doxygenfile:: samplingConfig.h
   :project: TensorRT-LLM

speculativeDecodingMode.h
_________________________

.. doxygenfile:: speculativeDecodingMode.h
   :project: TensorRT-LLM

memoryCounters.h
________________

.. doxygenfile:: memoryCounters.h
   :project: TensorRT-LLM

runtimeDefaults.h
_________________

.. doxygenfile:: runtimeDefaults.h
   :project: TensorRT-LLM

decodingOutput.h
________________

.. doxygenfile:: decodingOutput.h
   :project: TensorRT-LLM

decoderState.h
______________

.. doxygenfile:: decoderState.h
   :project: TensorRT-LLM

gptDecoder.h
____________

.. doxygenfile:: gptDecoder.h
   :project: TensorRT-LLM

explicitDraftTokensBuffers.h
____________________________

.. doxygenfile:: explicitDraftTokensBuffers.h
   :project: TensorRT-LLM

decodingInput.h
_______________

.. doxygenfile:: decodingInput.h
.. doxygenfile:: lookaheadBuffers.h
   :project: TensorRT-LLM

lookaheadModule.h
@@ -106,40 +16,10 @@ _________________
.. doxygenfile:: lookaheadModule.h
   :project: TensorRT-LLM

bufferManager.h
_______________

.. doxygenfile:: bufferManager.h
   :project: TensorRT-LLM

rawEngine.h
___________

.. doxygenfile:: rawEngine.h
   :project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
   :project: TensorRT-LLM

request.h
iBuffer.h
_________

.. doxygenfile:: request.h
   :project: TensorRT-LLM

cudaStream.h
____________

.. doxygenfile:: cudaStream.h
   :project: TensorRT-LLM

cudaEvent.h
___________

.. doxygenfile:: cudaEvent.h
.. doxygenfile:: iBuffer.h
   :project: TensorRT-LLM

modelConfig.h
@@ -148,52 +28,10 @@ _____________
.. doxygenfile:: modelConfig.h
   :project: TensorRT-LLM

ipcNvlsMemory.h
_______________
decodingOutput.h
________________

.. doxygenfile:: ipcNvlsMemory.h
   :project: TensorRT-LLM

iTensor.h
_________

.. doxygenfile:: iTensor.h
   :project: TensorRT-LLM

gptDecoderBatched.h
___________________

.. doxygenfile:: gptDecoderBatched.h
   :project: TensorRT-LLM

eagleModule.h
_____________

.. doxygenfile:: eagleModule.h
   :project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
   :project: TensorRT-LLM

loraCachePageManagerConfig.h
____________________________

.. doxygenfile:: loraCachePageManagerConfig.h
   :project: TensorRT-LLM

speculativeDecodingModule.h
___________________________

.. doxygenfile:: speculativeDecodingModule.h
   :project: TensorRT-LLM

lookaheadBuffers.h
__________________

.. doxygenfile:: lookaheadBuffers.h
.. doxygenfile:: decodingOutput.h
   :project: TensorRT-LLM

promptTuningParams.h
@@ -202,15 +40,177 @@ ____________________
.. doxygenfile:: promptTuningParams.h
   :project: TensorRT-LLM

bufferManager.h
_______________

.. doxygenfile:: bufferManager.h
   :project: TensorRT-LLM

gptJsonConfig.h
_______________

.. doxygenfile:: gptJsonConfig.h
   :project: TensorRT-LLM

runtimeDefaults.h
_________________

.. doxygenfile:: runtimeDefaults.h
   :project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
   :project: TensorRT-LLM

rawEngine.h
___________

.. doxygenfile:: rawEngine.h
   :project: TensorRT-LLM

gptDecoder.h
____________

.. doxygenfile:: gptDecoder.h
   :project: TensorRT-LLM

eagleBuffers.h
______________

.. doxygenfile:: eagleBuffers.h
   :project: TensorRT-LLM

medusaModule.h
______________

.. doxygenfile:: medusaModule.h
   :project: TensorRT-LLM

iBuffer.h
_________
explicitDraftTokensBuffers.h
____________________________

.. doxygenfile:: iBuffer.h
.. doxygenfile:: explicitDraftTokensBuffers.h
   :project: TensorRT-LLM

iTensor.h
_________

.. doxygenfile:: iTensor.h
   :project: TensorRT-LLM

common.h
________

.. doxygenfile:: common.h
   :project: TensorRT-LLM

loraCachePageManagerConfig.h
____________________________

.. doxygenfile:: loraCachePageManagerConfig.h
   :project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
   :project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
   :project: TensorRT-LLM

speculativeDecodingMode.h
_________________________

.. doxygenfile:: speculativeDecodingMode.h
   :project: TensorRT-LLM

cudaEvent.h
___________

.. doxygenfile:: cudaEvent.h
   :project: TensorRT-LLM

decodingInput.h
_______________

.. doxygenfile:: decodingInput.h
   :project: TensorRT-LLM

speculativeDecodingModule.h
___________________________

.. doxygenfile:: speculativeDecodingModule.h
   :project: TensorRT-LLM

iGptDecoderBatched.h
____________________

.. doxygenfile:: iGptDecoderBatched.h
   :project: TensorRT-LLM

eagleModule.h
_____________

.. doxygenfile:: eagleModule.h
   :project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
   :project: TensorRT-LLM

gptDecoderBatched.h
___________________

.. doxygenfile:: gptDecoderBatched.h
   :project: TensorRT-LLM

cudaStream.h
____________

.. doxygenfile:: cudaStream.h
   :project: TensorRT-LLM

ipcNvlsMemory.h
_______________

.. doxygenfile:: ipcNvlsMemory.h
   :project: TensorRT-LLM

samplingConfig.h
________________

.. doxygenfile:: samplingConfig.h
   :project: TensorRT-LLM

request.h
_________

.. doxygenfile:: request.h
   :project: TensorRT-LLM

decoderState.h
______________

.. doxygenfile:: decoderState.h
   :project: TensorRT-LLM

ipcUtils.h
__________

.. doxygenfile:: ipcUtils.h
   :project: TensorRT-LLM

memoryCounters.h
________________

.. doxygenfile:: memoryCounters.h
   :project: TensorRT-LLM
@@ -1,81 +1,22 @@
(disaggregated-service)=

# Disaggregated-Service (experimental)

# Disaggregated-Service (Experimental)

```{note}
This feature is currently experimental, and the related API is subject to change in future versions.
```

Currently TRT-LLM supports `disaggregated-service`, where the context and generation phases of a request can run on different executors. TRT-LLM's disaggregated service relies on the executor API; please make sure to read the [executor page](executor.md) before reading this document.

For more information on disaggregated service in LLM inference, refer to papers such as [DistServe](https://arxiv.org/abs/2401.09670) and [SplitWise](https://arxiv.org/abs/2311.18677).

## Usage

```cpp
enum class RequestType
{
    REQUEST_TYPE_CONTEXT_AND_GENERATION = 0,
    REQUEST_TYPE_CONTEXT_ONLY = 1,
    REQUEST_TYPE_GENERATION_ONLY = 2
};
```
The TRT-LLM executor can execute three types of requests: `REQUEST_TYPE_CONTEXT_AND_GENERATION`, `REQUEST_TYPE_CONTEXT_ONLY`, and `REQUEST_TYPE_GENERATION_ONLY`. An executor instance can execute the context phase of a context-only request or the generation phase of a generation-only request. When the executor completes the context phase of a context-only request, it maintains the corresponding KV cache, which will be requested by the executor handling the subsequent generation-only request.

Note that the environment variable `TRTLLM_USE_MPI_KVCACHE=1` should be set for `disaggregated-service`.

Here are some key APIs to use disaggregated service:
```cpp
Request request{...};

request.setRequestType(tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_ONLY);

auto contextRequestId = contextExecutor.enqueueRequest(request);

auto contextResponses = contextExecutor.awaitResponses(contextRequestId);

auto contextPhaseParams = contextResponses.back().getResult().contextPhaseParams.value();

request.setContextPhaseParams(contextPhaseParams);

request.setRequestType(tensorrt_llm::executor::RequestType::REQUEST_TYPE_GENERATION_ONLY);

auto generationRequestId = generationExecutor.enqueueRequest(request);

auto genResponses = generationExecutor.awaitResponses(generationRequestId);
```

The generationExecutor will request data such as the KV cache from the corresponding contextExecutor based on the `contextPhaseParams` attached to the request, so please make sure that the corresponding contextExecutor is not shut down before the generationExecutor's response is received.

In the code example above, the `contextRequestId` assigned by the contextExecutor and the `generationRequestId` assigned by the generationExecutor are independent; it is the user's responsibility to manage the mapping from the `requestId` of a context-only request to the `requestId` of the corresponding generation-only request. The `contextResponses` contain the first output token generated by the context phase, and the `genResponses` also contain that first token, so all output tokens can be obtained from the generationExecutor's responses.



An `orchestrator` is required in `disaggregated-service` to manage multiple executor instances and route requests to different executors. TRT-LLM provides the class `DisaggExecutorOrchestrator` in `cpp/include/tensorrt_llm/executor/disaggServerUtil.h` to launch multiple executor instances; however, `DisaggExecutorOrchestrator` only routes requests to executors with a simple round-robin policy, so users need to implement their own orchestrator for disaggregated-service based on their usage scenario.

## Example

Please refer to `examples/cpp/executor/executorExampleDisaggregated.cpp`

## Benchmarks

Please refer to `benchmarks/cpp/disaggServerBenchmark.cpp` and `benchmarks/cpp/README.md`
An [architectural and performance overview](../../../docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md) and [usage examples](../../../examples/disaggregated/README.md) are provided.

## Environment Variables

TRT-LLM uses some environment variables to control the behavior of disaggregated service.

* `TRTLLM_USE_MPI_KVCACHE`: Whether to use MPI to transfer KV cache. Currently, the default value is `0`.

* `TRTLLM_USE_UCX_KVCACHE`: Whether to use UCX to transfer KV cache. Currently, the default value is `0`. To use disaggregated service, either `TRTLLM_USE_MPI_KVCACHE=1` or `TRTLLM_USE_UCX_KVCACHE=1` is required to be set.
* `TRTLLM_USE_UCX_KVCACHE`: Specifies whether to use UCX for KV cache transfer. The default value is `0`. This must be enabled when using a disaggregated service.

* `TRTLLM_PARALLEL_CACHE_SEND`: If set to `1`, contextExecutor will attempt to send KV cache for multiple requests in parallel. The default value is `0`.

@@ -93,7 +34,6 @@ TRT-LLM uses some environment variables to control the behavior of disaggregated

* `TRTLLM_KVCACHE_SEND_MAX_CONCURRENCY_NUM`: The maximum number of concurrent KV cache sends. The default value is `4`. This environment variable only takes effect when `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE` is greater than 0.
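
As a quick reference, a minimal environment for a UCX-based disaggregated run might combine the variables above as follows; this is only a sketch of the defaults and options already documented, not additional tuning advice:

```bash
# Minimal sketch: environment for a UCX-based disaggregated run,
# combining the variables documented above.
export TRTLLM_USE_UCX_KVCACHE=1                    # required for disaggregated service
export TRTLLM_PARALLEL_CACHE_SEND=1                # optional: send KV cache for multiple requests in parallel
export TRTLLM_KVCACHE_SEND_MAX_CONCURRENCY_NUM=4   # default; only effective with a transfer buffer size > 0
```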

## Troubleshooting and FAQ

### General FAQs

@@ -122,28 +62,18 @@ A. Yes, but it's not recommended, TRT-LLM does not implement proper scheduling f

A. Yes, it's recommended that different executors use different GPUs. We support running context-only executors and generation-only executors on the same node or on different nodes. The `participantIds` and `deviceIds` used by each executor need to be explicitly set by the user, and the `participantIds` of the executors must not intersect.

*Q. What's the requirement for disaggregated-service in TRT-LLM?*

A. TRT-LLM currently requires `UCX`-backend `CUDA-aware MPI`. TRT-LLM implements KV cache transfer with [`CUDA-aware MPI`](https://docs.open-mpi.org/en/v5.0.x/tuning-apps/networking/cuda.html#how-do-i-build-open-mpi-with-cuda-aware-support), and will support more communication components for KV cache transfer in future versions.

### Debugging FAQs

*Q. How to handle the error `Disaggregated serving is not enabled, please check the configuration?`*

A. please set the environment variables
```
export TRTLLM_USE_MPI_KVCACHE=1
```
or
A. Please set the environment variables
```
export TRTLLM_USE_UCX_KVCACHE=1
```
When the environment variable `TRTLLM_USE_MPI_KVCACHE=1` is set, TRT-LLM will transfer the KV cache using `CUDA-aware MPI`. All executor processes involved must share the same MPI world communicator. Consequently, with `TRTLLM_USE_MPI_KVCACHE=1`, TRT-LLM only supports launching multiple executors via `MPI`. Additionally, the `CommunicationMode` for the executors must be set to `kLEADER` or `kORCHESTRATOR` with `SpawnProcesses=false` for the `disaggregated-service`. These restrictions do not apply when `TRTLLM_USE_UCX_KVCACHE=1` is set.

*Q. Why do some profiling tools show that TRT-LLM's KV cache transfer does not utilize NVLink even on devices equipped with NVLink?*

A. Ensure TRT-LLM is running with `UCX`-backend `CUDA-aware MPI`, and check the version of `UCX` with `ucx_info -v`.
A. Please check the version of `UCX` with `ucx_info -v`.
If the version of UCX <= 1.17, set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda` and `UCX_MEMTYPE_CACHE=n` to enable NVLink. For Blackwell architecture GPUs, UCX version >= 1.19 is required to enable NVLink.
If the version of UCX >= 1.18, there are several ways to enable NVLink (a shell sketch of the first option follows the list):
1. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B`, `UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda`, `UCX_CUDA_COPY_DMABUF=no`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
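
For convenience, the first option above can be written as a small shell sketch; the variable names and values are exactly those listed, with nothing extra implied:

```bash
# Sketch of option 1: enable NVLink for KV cache transfer with UCX >= 1.18.
export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
export UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
export UCX_CUDA_COPY_DMABUF=no
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
```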

@@ -154,7 +84,6 @@ If the version of UCX >=1.18, there are several ways to enable NVLink:
A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer, but it is not enabled by default. There are several ways to enable GPU direct RDMA:
1. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B`, `UCX_RNDV_FRAG_MEM_TYPE=cuda`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`, where `$Size` is the size of the buffer for KV cache transfer; it is recommended to be larger than the size of the KV cache for the longest request. A shell sketch of this option follows.
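
A minimal sketch of the second option; the `4GB` value is purely illustrative and should be replaced with a size larger than the KV cache of your longest request:

```bash
# Sketch of option 2: enable GPU direct RDMA with a staging buffer.
export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=4GB   # illustrative value, not a recommendation
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
```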

To achieve optimal performance when using GPU direct RDMA, it is advisable to create the CUDA context before MPI initialization when `TRTLLM_USE_MPI_KVCACHE=1` is set. One possible approach is to rely on MPI environment variables to set the correct device before MPI initialization.

*Q. Are there any guidelines for performance tuning of KV cache transfer?*

@@ -195,20 +195,20 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers

#### Benchmark
```bash
cat >./extra-llm-api-config.yml <<EOF
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 896
- 512
- 256
- 128
- 64
- 32
- 16
- 8
- 4
- 2
- 1
cuda_graph_config:
  padding_enabled: true
  batch_sizes:
  - 896
  - 512
  - 256
  - 128
  - 64
  - 32
  - 16
  - 8
  - 4
  - 2
  - 1
print_iter_log: true
kv_cache_dtype: fp8
enable_attention_dp: true
@@ -262,19 +262,19 @@ python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
YOUR_DATA_PATH=./dataset.txt

cat >./extra-llm-api-config.yml <<EOF
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 384
cuda_graph_config:
  padding_enabled: true
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256
  - 384
print_iter_log: ${PRINT_ITER_LOG}
enable_attention_dp: true
EOF

@@ -110,10 +110,10 @@ The MTP module follows the design in DeepSeek-V3. The embedding layer and output
Attention is also a very important component in supporting MTP inference. The changes are mainly in the attention kernels for the generation phase. For a normal request, there will be only one input token in the generation phase, but for MTP, there will be $K+1$ input tokens. Since MTP sequentially predicts additional tokens, the predicted draft tokens are chained. Though we have an MTP Eagle path, currently we only have chain-based support for MTP Eagle. So, a causal mask is enough for the attention kernel to support MTP. In our implementation, TensorRT-LLM will use the FP8 flashMLA generation kernel on Hopper GPUs, while using TRTLLM customized attention kernels on Blackwell for better performance.

### How to run DeepSeek models with MTP
To run DeepSeek-V3/R1 models with MTP, use [examples/pytorch/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/pytorch/quickstart_advanced.py) with additional options:
To run DeepSeek-V3/R1 models with MTP, use [examples/llm-api/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llm-api/quickstart_advanced.py) with additional options:

```bash
cd examples/pytorch
cd examples/llm-api
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N
```

@@ -165,10 +165,10 @@ Note that the Relaxed Acceptance will only be used during the thinking phase, wh

### How to run the DeepSeek-R1 model with Relaxed Acceptance

To run DeepSeek-R1 models with MTP Relaxed Acceptance, use [examples/pytorch/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/pytorch/quickstart_advanced.py) with additional options:
To run DeepSeek-R1 models with MTP Relaxed Acceptance, use [examples/llm-api/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llm-api/quickstart_advanced.py) with additional options:

```bash
cd examples/pytorch
cd examples/llm-api
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N --use_relaxed_acceptance_for_thinking --relaxed_topk 10 --relaxed_delta 0.6
```

@@ -151,7 +151,13 @@ These optimizations target the overall execution flow, scheduling, and resource

* CUDA Graph

This had a significant **22% E2E performance impact** for throughput scenarios.

CUDA Graphs allow capturing a sequence of CUDA operations and launching them as a single unit, drastically reducing kernel launch overheads. This is particularly beneficial for models with many small kernels, and particularly on the PyTorch flow, because the Python host code normally executes more slowly than C++. Since a CUDA Graph freezes the kernel launch parameters, which are normally associated with the tensor shapes, it can only be safely used with static shapes, meaning that different CUDA Graphs need to be captured for different batch sizes. Each graph has some cost in memory usage and capture time, so we cannot capture a CUDA Graph for every possible batch. For the non-captured batch sizes, PyTorch eager mode code is executed.

There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest one with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted token computation.

Users can opt out of the CUDA Graph padding feature to see the perf benefits, by setting `cuda_graph_config:\n  padding_enabled: False`; see the API here: [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41). A config sketch follows.
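
As a minimal sketch, the opt-out can be written into the extra LLM API options file used elsewhere in this document (the file name is just an example):

```bash
# Sketch: disable CUDA Graph padding via the extra LLM API options file.
cat > ./extra-llm-api-config.yml <<EOF
cuda_graph_config:
  padding_enabled: false
EOF
```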

* Overlap Scheduler:

@@ -18,6 +18,8 @@ By NVIDIA TensorRT-LLM Team
  - [ISL 4400 - OSL 1200 (Machine Translation Dataset)](#ISL-4400---OSL-1200-Machine-Translation-Dataset)
  - [ISL 8192 - OSL 256 (Synthetic Dataset)](#ISL-8192---OSL-256-Synthetic-Dataset)
  - [ISL 4096 - OSL 1024 (Machine Translation Dataset)](#ISL-4096---OSL-1024-Machine-Translation-Dataset)
- [Qwen 3](#Qwen-3)
  - [ISL 8192 - OSL 1024 (Machine Translation Dataset)](#ISL-8192---OSL-1024-Machine-Translation-Dataset)
- [Reproducing Steps](#Reproducing-Steps)
- [Future Work](#Future-Work)
- [Acknowledgement](#Acknowledgement)
@@ -260,6 +262,19 @@ In Figure 13 and 14, the E2E Pareto curves for aggregated serving and disaggrega

For Pareto curves with MTP = 1, 2, 3, it can be observed that disaggregated results show a **1.7x** improvement over aggregated results at 50 tokens/sec/user (20 ms latency). Enabling MTP provides a larger speedup at higher concurrencies.

### Qwen 3

#### ISL 8192 - OSL 1024 (Machine Translation Dataset)

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture15.png" width="640" height="auto" alt="Qwen 3 Pareto curves">
</figure>
</div>
<p align="center"><sub><em>Figure 15. Qwen 3 Pareto curves.</em></sub></p>

We also conducted performance evaluations of Qwen 3 on GB200 GPUs. The data indicate that the speedups achieved by disaggregation over aggregation range from 1.7x to 6.11x.

### Reproducing Steps

We provide a set of scripts to reproduce the performance data presented in this paper. Please refer to the usage instructions described in [this document](https://github.com/NVIDIA/TensorRT-LLM/tree/main/docs/source/scripts/disaggregated).

@@ -0,0 +1,148 @@
# How to launch Llama4 Maverick + Eagle3 TensorRT-LLM server

Artificial Analysis has benchmarked the Llama4 Maverick with Eagle3 enabled TensorRT-LLM server running at over [1000 tokens per second per user on 8xB200 GPUs](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/). This implementation leverages NVIDIA's TensorRT-LLM combined with speculative decoding using the Eagle3 model to further boost performance.

In the guide below, we will walk you through how to launch your own high-performance Llama4 Maverick with Eagle3 enabled TensorRT-LLM server, from build to deployment. (Note that your specific performance numbers may vary; speculative decoding speedups depend on the dataset!)

## Prerequisites

- 8x NVIDIA B200 GPUs in a single node (we have a forthcoming guide for getting great performance on H100)
- CUDA Toolkit 12.8 or later
- Docker with NVIDIA Container Toolkit installed
- Fast SSD storage for model weights
- Access to Llama4 Maverick and Eagle3 model checkpoints
- A love of speed

## Download Artifacts

* [NVIDIA Llama 4 Maverick 17B 128E Instruct FP8](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8)
* [NVIDIA Llama 4 Maverick 17B 128E Eagle3 BF16](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Eagle3)

In [Step 4: Start the TensorRT-LLM server](#step-4-start-the-tensorrt-llm-server), `/path/to/maverick` and `/path/to/eagle` refer to the download paths of the above respective models.

## Launching the server

### Step 1: Clone the repository

```
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
git submodule update --init --recursive
git lfs pull
```

The last command, `git lfs pull`, ensures all large files stored with Git LFS are properly downloaded. If `git lfs` is not installed, install it by following [Install Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage).
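
If you are unsure whether Git LFS is set up, a minimal check might look like the sketch below; the `apt-get` package name is an assumption for Debian/Ubuntu systems, so refer to the linked guide for other platforms:

```bash
# Sketch: verify Git LFS is installed and initialized before pulling large files.
git lfs version || sudo apt-get install -y git-lfs   # assumed package name; see the install guide above
git lfs install
git lfs pull
```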

### Step 2: Prepare the TensorRT-LLM release Docker image

#### Option 1. Use weekly release NGC docker image
TensorRT-LLM provides a weekly release [Docker image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release).

#### Option 2. Build TensorRT-LLM Docker image (Alternative way)
If you want to compile a specific TensorRT-LLM commit, you can build the Docker image by checking out the specific branch or commit and running a make command. This may take 15-30 minutes depending on your system.

```
make -C docker release_build
```

### Step 3: (Optional) Tag and push the Docker image to your registry

If you want to use this image on multiple machines or in a cluster:

```
docker tag tensorrt_llm/release:latest docker.io/<username>/tensorrt_llm:main
docker push docker.io/<username>/tensorrt_llm:main
```

Replace `<username>` with your Docker Hub username or your private registry path.

### Step 4: Start the TensorRT-LLM server

This command launches the server with Llama4 Maverick as the main model and Eagle3 as the draft model for speculative decoding. Make sure you have downloaded both model checkpoints before running this command.

**Important:** Replace `/path/to/maverick` and `/path/to/eagle` with the actual paths to your Maverick and Eagle3 model checkpoints on your host machine, downloaded in the [Download Artifacts](#download-artifacts) stage.

```
docker run -d --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
  -p 8000:8000 --gpus=all -e "TRTLLM_ENABLE_PDL=1" \
  -v /path/to/maverick:/config/models/maverick -v /path/to/eagle:/config/models/eagle \
  docker.io/<username>/tensorrt_llm:main sh \
  -c "echo -e 'enable_attention_dp: false\nenable_min_latency: true\nenable_autotuner: false\ncuda_graph_config:\n max_batch_size: 8\nspeculative_config:\n decoding_type: Eagle\n max_draft_len: 3\n speculative_model_dir: /config/models/eagle\nkv_cache_config:\n enable_block_reuse: false' > c.yaml && \
     TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True \
     trtllm-serve /config/models/maverick \
     --host 0.0.0.0 --port 8000 \
     --backend pytorch --tp_size 8 --ep_size 1 \
     --trust_remote_code --extra_llm_api_options c.yaml \
     --kv_cache_free_gpu_memory_fraction 0.75"
```

This command:
- Runs the container in detached mode (`-d`)
- Sets up shared memory and stack limits for optimal performance
- Maps port 8000 from the container to your host
- Enables all GPUs with tensor parallelism across all 8 GPUs
- Creates a configuration file for speculative decoding with Eagle3
- Configures memory settings for optimal throughput

After running this command, the server will initialize, which may take several minutes as it loads and optimizes the models.

You can query the health/readiness of the server using
```
curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/health"
```

When a 200 code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
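
If you are scripting the deployment, a simple readiness loop around the same health endpoint might look like this sketch:

```bash
# Sketch: poll the health endpoint until the server returns HTTP 200.
until [ "$(curl -s -o /dev/null -w '%{http_code}' http://localhost:8000/health)" = "200" ]; do
  echo "Waiting for server..."
  sleep 10
done
echo "Server is ready."
```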

### Step 5: Test the server with a sample request

Once the server is running, you can test it with a simple curl request:

```
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "Llama4-eagle",
  "messages": [{"role": "user", "content": "Why is NVIDIA a great company?"}],
  "max_tokens": 1024
}' -w "\n"

# {"id":"chatcmpl-e752184d1181494c940579c007ab2c5f","object":"chat.completion","created":1748018634,"model":"Llama4-eagle","choices":[{"index":0,"message":{"role":"assistant","content":"NVIDIA is considered a great company for several reasons:\n\n1. **Innovative Technology**: NVIDIA is a leader in the development of graphics processing units (GPUs) and high-performance computing hardware. Their GPUs are used in a wide range of applications, from gaming and professional visualization to artificial intelligence (AI), deep learning, and autonomous vehicles.\n2. ...","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":17,"total_tokens":552,"completion_tokens":535}}
```

The server exposes a standard OpenAI-compatible API endpoint that accepts JSON requests. You can adjust parameters like `max_tokens`, `temperature`, and others according to your needs.

### Step 6: (Optional) Monitor server logs

To view the logs of the running container:

```
docker ps # get the container id
docker logs -f <container_id>
```

This is useful for troubleshooting or monitoring performance statistics reported by the server.

### Step 7: (Optional) Stop the server

When you're done with the server:

```
docker ps # get the container id
docker kill <container_id>
```

## Troubleshooting Tips

- If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`
- Ensure your model checkpoints are compatible with the expected format
- For performance issues, check GPU utilization with `nvidia-smi` while the server is running
- If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed
- For connection issues, make sure port 8000 is not being used by another application

## Performance Tuning

The configuration provided is optimized for 8xB200 GPUs, but you can adjust several parameters for your specific workload; a sketch follows the list:

- `max_batch_size`: Controls how many requests can be batched together
- `max_draft_len`: The number of tokens Eagle can speculate ahead
- `kv_cache_free_gpu_memory_fraction`: Controls memory allocation for the KV cache
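
As a minimal sketch, a latency-leaning variant of the `c.yaml` from Step 4 might adjust these knobs as follows; all values here are illustrative assumptions, not tuned recommendations:

```bash
# Sketch: an adjusted c.yaml; values are illustrative, not tuned.
cat > c.yaml <<EOF
enable_attention_dp: false
enable_min_latency: true
enable_autotuner: false
cuda_graph_config:
  max_batch_size: 4        # fewer requests batched together
speculative_config:
  decoding_type: Eagle
  max_draft_len: 2         # how far Eagle speculates ahead
  speculative_model_dir: /config/models/eagle
kv_cache_config:
  enable_block_reuse: false
EOF
# Then pass it to trtllm-serve via --extra_llm_api_options c.yaml and,
# if memory allows, raise --kv_cache_free_gpu_memory_fraction.
```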

@@ -21,6 +21,7 @@ _____________
   llm_guided_decoding
   llm_logits_processor
   llm_multilora
   llm_speculative_decoding

Slurm
_____

@@ -3,6 +3,6 @@ Generate text
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_inference.py.

.. literalinclude:: ../../../examples/llm-api/llm_inference.py
   :lines: 4-37
   :lines: 4-35
   :language: python
   :linenos:

@@ -3,6 +3,6 @@ Generate text asynchronously
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_inference_async.py.

.. literalinclude:: ../../../examples/llm-api/llm_inference_async.py
   :lines: 4-45
   :lines: 4-43
   :language: python
   :linenos:

@@ -3,6 +3,6 @@ Generate text in streaming
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_inference_async_streaming.py.

.. literalinclude:: ../../../examples/llm-api/llm_inference_async_streaming.py
   :lines: 4-65
   :lines: 4-64
   :language: python
   :linenos:

@@ -3,6 +3,6 @@ Distributed LLM Generation
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_inference_distributed.py.

.. literalinclude:: ../../../examples/llm-api/llm_inference_distributed.py
   :lines: 4-46
   :lines: 4-44
   :language: python
   :linenos:

@@ -0,0 +1,8 @@
Speculative Decoding
====================
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_speculative_decoding.py.

.. literalinclude:: ../../../examples/llm-api/llm_speculative_decoding.py
   :lines: 4-92
   :language: python
   :linenos:
@ -2,28 +2,11 @@

The LLM API is a high-level Python API designed to streamline LLM inference workflows.

It supports a broad range of use cases, from single-GPU setups to multi-GPU and multi-node deployments, with built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo) and the [Triton Inference Server](https://github.com/triton-inference-server/server).
It supports a broad range of use cases, from single-GPU setups to multi-GPU and multi-node deployments, with built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo).

While the LLM API simplifies inference workflows with a high-level interface, it is also designed with flexibility in mind. Under the hood, it uses a PyTorch-native and modular backend, making it easy to customize, extend, or experiment with the runtime.

## Supported Models

* DeepSeek variants
* Llama (including variants Mistral, Mixtral, InternLM)
* GPT (including variants Starcoder-1/2, Santacoder)
* Gemma-1/2/3
* Phi-1/2/3/4
* ChatGLM (including variants glm-10b, chatglm, chatglm2, chatglm3, glm4)
* QWen-1/1.5/2/3
* Falcon
* Baichuan-1/2
* GPT-J
* Mamba-1/2

> **Note:** For the most up-to-date list of supported models, you may refer to the [TensorRT-LLM model definitions](https://github.com/NVIDIA/TensorRT-LLM/tree/main/tensorrt_llm/_torch/models).

## Quick Start Example

A simple inference example with TinyLlama using the LLM API:

@ -31,7 +14,8 @@ A simple inference example with TinyLlama using the LLM API:
:language: python
:linenos:
```
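
The included script follows the pattern below (a minimal sketch; the prompt and sampling settings are illustrative):

```python
from tensorrt_llm import LLM, SamplingParams


def main():
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    for output in llm.generate(["Hello, my name is"], sampling_params):
        print(output.outputs[0].text)


if __name__ == "__main__":
    main()
```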
More examples can be found [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api).
For more advanced usage including distributed inference, multimodal, and speculative decoding, please refer to this [README](../../../examples/llm-api/README.md).
## Model Input
@ -65,7 +49,6 @@ llm = LLM(model=<local_path_to_model>)
> **Note:** Some models require accepting specific [license agreements](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). Make sure you have agreed to the terms and authenticated with Hugging Face before downloading.
## Tips and Troubleshooting
The following tips typically assist new LLM API users who are familiar with other APIs that are part of TensorRT-LLM:
@ -169,6 +169,12 @@ API Reference
:show-inheritance:
:special-members: __init__

.. autoclass:: tensorrt_llm.llmapi.UserProvidedDecodingConfig
:members:
:undoc-members:
:show-inheritance:
:special-members: __init__

.. autoclass:: tensorrt_llm.llmapi.TorchCompileConfig
:members:
:undoc-members:

@ -196,8 +196,8 @@ if __name__ == '__main__':
main()
```

We provide an out-of-tree modeling example in `examples/pytorch/out_of_tree_example`. The model is implemented in `modeling_opt.py` and you can run the example by:
We provide an out-of-tree modeling example in `examples/llm-api/out_of_tree_example`. The model is implemented in `modeling_opt.py` and you can run the example by:

```bash
python examples/pytorch/out_of_tree_example/main.py
python examples/llm-api/out_of_tree_example/main.py
```

@ -9,7 +9,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>Disaggregated-Service (experimental) &#8212; TensorRT-LLM</title>
<title>Disaggregated-Service (Experimental) &#8212; TensorRT-LLM</title>
@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -70,7 +70,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc2" />
<meta name="docsearch:version" content="1.0.0rc3" />

</head>
@ -348,6 +348,7 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -408,7 +409,7 @@
<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Disaggregated-Service (experimental)</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Disaggregated-Service (Experimental)</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="nav bd-sidenav">
@ -477,7 +478,7 @@
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">Disaggregated-Service (experimental)</span></li>
<li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">Disaggregated-Service (Experimental)</span></li>
</ul>
</nav>
</div>
@ -495,7 +496,7 @@
<article class="bd-article">

<section id="disaggregated-service-experimental">
<span id="disaggregated-service"></span><h1>Disaggregated-Service (experimental)<a class="headerlink" href="#disaggregated-service-experimental" title="Link to this heading">#</a></h1>
<span id="disaggregated-service"></span><h1>Disaggregated-Service (Experimental)<a class="headerlink" href="#disaggregated-service-experimental" title="Link to this heading">#</a></h1>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Note:
@ -503,57 +504,12 @@ This feature is currently experimental, and the related API is subjected to chan
</div>
<p>Currently TRT-LLM supports <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code>, where the context and generation phases of a request can run on different executors. TRT-LLM’s disaggregated service relies on the executor API; please make sure to read the <a class="reference internal" href="executor.html"><span class="std std-doc">executor page</span></a> before reading this document.</p>
<p>For more information on disaggregated service in LLM inference, refer to papers such as <a class="reference external" href="https://arxiv.org/abs/2401.09670">DistServe</a> and <a class="reference external" href="https://arxiv.org/abs/2311.18677">SplitWise</a>.</p>
<section id="usage">
<h2>Usage<a class="headerlink" href="#usage" title="Link to this heading">#</a></h2>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">enum</span><span class="w"> </span><span class="k">class</span><span class="w"> </span><span class="nc">RequestType</span>
|
||||
<span class="p">{</span>
|
||||
<span class="w"> </span><span class="n">REQUEST_TYPE_CONTEXT_AND_GENERATION</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="n">REQUEST_TYPE_CONTEXT_ONLY</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="n">REQUEST_TYPE_GENERATION_ONLY</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">2</span>
|
||||
<span class="p">};</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The TRT-LLM executor can execute three types of requests: <code class="docutils literal notranslate"><span class="pre">REQUEST_TYPE_CONTEXT_AND_GENERATION</span></code>, <code class="docutils literal notranslate"><span class="pre">REQUEST_TYPE_CONTEXT_ONLY</span></code>, and <code class="docutils literal notranslate"><span class="pre">REQUEST_TYPE_GENERATION_ONLY</span></code>. An executor instance can execute the context phase of a context-only request or the generation phase of a generation-only request. When the executor completes the context phase of a context-only request, it maintains the corresponding KV cache, which will be requested by the executor handling the subsequent generation-only request.</p>
<p>Note that the environment variable <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_MPI_KVCACHE=1</span></code> should be set for <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code>.</p>
<p>Here are some key APIs to use disaggregated service:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">Request</span><span class="w"> </span><span class="n">request</span><span class="p">{...};</span>
|
||||
|
||||
<span class="n">request</span><span class="p">.</span><span class="n">setRequestType</span><span class="p">(</span><span class="n">tensorrt_llm</span><span class="o">::</span><span class="n">executor</span><span class="o">::</span><span class="n">RequestType</span><span class="o">::</span><span class="n">REQUEST_TYPE_CONTEXT_ONLY</span><span class="p">);</span>
|
||||
|
||||
<span class="k">auto</span><span class="w"> </span><span class="n">contextRequestId</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">contextExecutor</span><span class="p">.</span><span class="n">enqueueRequest</span><span class="p">(</span><span class="n">request</span><span class="p">);</span>
|
||||
|
||||
<span class="k">auto</span><span class="w"> </span><span class="n">contextResponses</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">contextExecutor</span><span class="p">.</span><span class="n">awaitResponses</span><span class="p">(</span><span class="n">contextRequestId</span><span class="p">);</span>
|
||||
|
||||
<span class="k">auto</span><span class="w"> </span><span class="n">contextPhaseParams</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">contextResponses</span><span class="p">.</span><span class="n">back</span><span class="p">().</span><span class="n">getResult</span><span class="p">().</span><span class="n">contextPhaseParams</span><span class="p">.</span><span class="n">value</span><span class="p">();</span>
|
||||
|
||||
<span class="n">request</span><span class="p">.</span><span class="n">setContextPhaseParams</span><span class="p">(</span><span class="n">contextPhaseParams</span><span class="p">);</span>
|
||||
|
||||
<span class="n">request</span><span class="p">.</span><span class="n">setRequestType</span><span class="p">(</span><span class="n">tensorrt_llm</span><span class="o">::</span><span class="n">executor</span><span class="o">::</span><span class="n">RequestType</span><span class="o">::</span><span class="n">REQUEST_TYPE_GENERATION_ONLY</span><span class="p">);</span>
|
||||
|
||||
<span class="k">auto</span><span class="w"> </span><span class="n">generationRequestId</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">generationExecutor</span><span class="p">.</span><span class="n">enqueueRequest</span><span class="p">(</span><span class="n">request</span><span class="p">);</span>
|
||||
|
||||
<span class="k">auto</span><span class="w"> </span><span class="n">genResponses</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">generationExecutor</span><span class="p">.</span><span class="n">awaitResponses</span><span class="p">(</span><span class="n">generationRequestId</span><span class="p">);</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The generationExecutor will require data such as KV cache from the corresponding contextExecutor based on the <code class="docutils literal notranslate"><span class="pre">contextPhaseParams</span></code> attached to the request, so please make sure that the corresponding contextExecutor is not shut down before getting the generationExecutor’s response.</p>
<p>In the code example above, the <code class="docutils literal notranslate"><span class="pre">contextRequestId</span></code> assigned by the contextExecutor and the <code class="docutils literal notranslate"><span class="pre">generationRequestId</span></code> assigned by the generationExecutor are independent; it is the user’s responsibility to manage the mapping of the <code class="docutils literal notranslate"><span class="pre">requestId</span></code> for context-only requests to the <code class="docutils literal notranslate"><span class="pre">requestId</span></code> for generation-only requests. The <code class="docutils literal notranslate"><span class="pre">contextResponses</span></code> contains the first output token generated by the context phase, and the <code class="docutils literal notranslate"><span class="pre">genResponses</span></code> also contains the first output token generated by the contextExecutor, so all output tokens can be obtained from the generationExecutor’s responses.</p>
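<p>For illustration, this bookkeeping can be as small as a dictionary on the orchestrator side (a hypothetical Python sketch; these helpers are not part of the executor API):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre># Hypothetical orchestrator-side bookkeeping: remember which
# generation-only request continues which context-only request.
ctx_to_gen = {}

def record_mapping(context_request_id, generation_request_id):
    ctx_to_gen[context_request_id] = generation_request_id

def generation_id_for(context_request_id):
    return ctx_to_gen[context_request_id]
</pre></div></div>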
<p><img alt="disaggregated-service usage" src="../_images/disaggregated-service_usage.png" /></p>
<p>An <code class="docutils literal notranslate"><span class="pre">orchestrator</span></code> is required in <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code> to manage multiple executor instances and route requests to different executors. TRT-LLM provides the class <code class="docutils literal notranslate"><span class="pre">DisaggExecutorOrchestrator</span></code> in <code class="docutils literal notranslate"><span class="pre">cpp/include/tensorrt_llm/executor/disaggServerUtil.h</span></code> to launch multiple executor instances; however, <code class="docutils literal notranslate"><span class="pre">DisaggExecutorOrchestrator</span></code> only routes requests to executors with a simple round-robin policy, so users need to implement their own orchestrator for disaggregated-service based on their usage scenario.</p>
</section>
<section id="example">
<h2>Example<a class="headerlink" href="#example" title="Link to this heading">#</a></h2>
<p>Please refer to <code class="docutils literal notranslate"><span class="pre">examples/cpp/executor/executorExampleDisaggregated.cpp</span></code>.</p>
</section>
<section id="benchmarks">
<h2>Benchmarks<a class="headerlink" href="#benchmarks" title="Link to this heading">#</a></h2>
<p>Please refer to <code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/disaggServerBenchmark.cpp</span></code> and <code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/README.md</span></code>.</p>
</section>
<p>An <a class="reference internal" href="../blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.html"><span class="std std-doc">architectural and performance overview</span></a>, as well as <span class="xref myst">usage examples</span>, is provided.</p>
<section id="environment-variables">
<h2>Environment Variables<a class="headerlink" href="#environment-variables" title="Link to this heading">#</a></h2>
<p>TRT-LLM uses some environment variables to control the behavior of disaggregated service.</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_MPI_KVCACHE</span></code>: Whether to use MPI to transfer KV cache. Currently, the default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_UCX_KVCACHE</span></code>: Whether to use UCX to transfer KV cache. Currently, the default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>. To use disaggregated service, either <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_MPI_KVCACHE=1</span></code> or <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_UCX_KVCACHE=1</span></code> is required to be set.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_UCX_KVCACHE</span></code>: Specifies whether to use UCX for KV cache transfer. The default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>. This must be enabled when using a disaggregated service.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_PARALLEL_CACHE_SEND</span></code>: If set to <code class="docutils literal notranslate"><span class="pre">1</span></code>, contextExecutor will attempt to send KV cache for multiple requests in parallel. The default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP</span></code>: If set to <code class="docutils literal notranslate"><span class="pre">1</span></code>, generationExecutor will not overlap KV cache transfer with model inference. The default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_ENABLE_KVCACHE_RECEIVE_PARALLEL</span></code>: When the generation rank receives KV cache from multiple context ranks within a single context instance, it will receive KV cache from each rank sequentially. If set to <code class="docutils literal notranslate"><span class="pre">1</span></code>, the generation rank will receive KV cache from each rank within one context instance in parallel. The default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
@ -580,23 +536,16 @@ This feature is currently experimental, and the related API is subjected to chan
<p>A. Yes, but it’s not recommended. TRT-LLM does not implement proper scheduling for the case where an executor handles mixed context-only and generation-only requests, so it’s better to run context-only requests and generation-only requests on different executors.</p>
<p><em>Q. Does disaggregated-service in TRT-LLM support multi-GPU and multi-node?</em></p>
<p>A. Yes. It’s recommended that different executors use different GPUs. Context-only executors and generation-only executors can run on the same node or on different nodes. The <code class="docutils literal notranslate"><span class="pre">participantIds</span></code> and <code class="docutils literal notranslate"><span class="pre">deviceIds</span></code> used by each executor need to be explicitly set by the user, and the <code class="docutils literal notranslate"><span class="pre">participantIds</span></code> of the executors must not intersect.</p>
<p><em>Q. What’s the requirement for disaggregated-service in TRT-LLM?</em></p>
<p>A. TRT-LLM currently requires <code class="docutils literal notranslate"><span class="pre">UCX</span></code>-backend <code class="docutils literal notranslate"><span class="pre">CUDA-aware</span> <span class="pre">MPI</span></code>. TRT-LLM implements KV cache transfer with <a class="reference external" href="https://docs.open-mpi.org/en/v5.0.x/tuning-apps/networking/cuda.html#how-do-i-build-open-mpi-with-cuda-aware-support"><code class="docutils literal notranslate"><span class="pre">CUDA-aware</span> <span class="pre">MPI</span></code></a>, and will support more communication components for KV cache transfer in a future version.</p>
</section>
<section id="debugging-faqs">
<h3>Debugging FAQs<a class="headerlink" href="#debugging-faqs" title="Link to this heading">#</a></h3>
<p><em>Q. How to handle error <code class="docutils literal notranslate"><span class="pre">Disaggregated</span> <span class="pre">serving</span> <span class="pre">is</span> <span class="pre">not</span> <span class="pre">enabled,</span> <span class="pre">please</span> <span class="pre">check</span> <span class="pre">the</span> <span class="pre">configuration?</span></code></em></p>
<p>A. please set the environment variables</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">export</span> <span class="n">TRTLLM_USE_MPI_KVCACHE</span><span class="o">=</span><span class="mi">1</span>
</pre></div>
</div>
<p>or</p>
<p>A. Please set the environment variables</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">export</span> <span class="n">TRTLLM_USE_UCX_KVCACHE</span><span class="o">=</span><span class="mi">1</span>
</pre></div>
</div>
<p>When the environment variable <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_MPI_KVCACHE=1</span></code> is set, TRT-LLM will transfer the KV cache using <code class="docutils literal notranslate"><span class="pre">CUDA-aware</span> <span class="pre">MPI</span></code>. All executor processes involved must share the same MPI world communicator. Consequently, with <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_MPI_KVCACHE=1</span></code>, TRT-LLM only supports launching multiple executors via <code class="docutils literal notranslate"><span class="pre">MPI</span></code>. Additionally, the <code class="docutils literal notranslate"><span class="pre">CommunicationMode</span></code> for the executors must be set to <code class="docutils literal notranslate"><span class="pre">kLEADER</span></code> or <code class="docutils literal notranslate"><span class="pre">kORCHESTRATOR</span></code> with <code class="docutils literal notranslate"><span class="pre">SpawnProcesses=false</span></code> for the <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code>. These restrictions do not apply when <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_UCX_KVCACHE=1</span></code> is set.</p>
<p><em>Q. Why do some profiling tools show that TRT-LLM’s KV cache transfer does not utilize NVLink even on devices equipped with NVLink?</em></p>
<p>A. Ensure TRT-LLM is running with <code class="docutils literal notranslate"><span class="pre">UCX</span></code>-backend <code class="docutils literal notranslate"><span class="pre">CUDA-aware</span> <span class="pre">MPI</span></code>, and check the version of <code class="docutils literal notranslate"><span class="pre">UCX</span></code> with <code class="docutils literal notranslate"><span class="pre">ucx_info</span> <span class="pre">-v</span></code>.
<p>A. Please check the version of <code class="docutils literal notranslate"><span class="pre">UCX</span></code> with <code class="docutils literal notranslate"><span class="pre">ucx_info</span> <span class="pre">-v</span></code>.
If the version of UCX <=1.17, set the environment variables <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_FRAG_MEM_TYPE=cuda</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> to enable NVLink. For Blackwell architecture GPUs, UCX version >=1.19 is required to enable NVLink.
If the version of UCX >=1.18, there are several ways to enable NVLink:</p>
<ol class="arabic simple">
@ -607,8 +556,7 @@ If the version of UCX >=1.18, there are several ways to enable NVLink:</p>
<p>A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer, but it is not enabled by default. There are several ways to enable GPU direct RDMA:</p>
<ol class="arabic simple">
<li><p>Set the environment variables <code class="docutils literal notranslate"><span class="pre">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B</span></code>,<code class="docutils literal notranslate"><span class="pre">UCX_RNDV_FRAG_MEM_TYPE=cuda</span></code>, <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_PIPELINE_ERROR_HANDLING=y</span></code>.</p></li>
<li><p>Set the environment variables <code class="docutils literal notranslate"><span class="pre">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size</span></code>, <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_PIPELINE_ERROR_HANDLING=y</span></code>, $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
To achieve the optimal performance when using GPU direct RDMA, it is advisable to create CUDA context before MPI initialization when TRTLLM_USE_MPI_KVCACHE=1 is set. One possible approach is to rely on MPI environment variables to set the correct device before MPI initialization.</p></li>
<li><p>Set the environment variables <code class="docutils literal notranslate"><span class="pre">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size</span></code>, <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_PIPELINE_ERROR_HANDLING=y</span></code>, $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.</p></li>
</ol>
<p><em>Q. Are there any guidelines for performance tuning of KV cache transfer?</em></p>
<p>A. Depending on the user’s use case, certain sets of environment variables can help avoid poor KV cache transfer performance.</p>
@ -689,9 +637,6 @@ export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#usage">Usage</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#example">Example</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#benchmarks">Benchmarks</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#environment-variables">Environment Variables</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#troubleshooting-and-faq">Troubleshooting and FAQ</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#general-faqs">General FAQs</a></li>
@ -792,9 +737,9 @@ export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
<div class="footer-item">
<div class="extra_footer">

<p>Last updated on July 06, 2025.</p>
<p>Last updated on July 14, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/66f299a">66f299a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>

</div></div>