Update latest GitHub pages to v1.0.0rc4

Kaiyu Xie 2025-07-22 03:09:09 +00:00
parent 36c7a40fb7
commit 51a490fa57
207 changed files with 21222 additions and 14146 deletions

View File

@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: ee79abf721be5d1b28815a3912832a13
config: dab0402c124e392bd849f27a08ca7210
tags: 645f666f9bcd5a90fca523b33c5a78b7

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -72,7 +72,7 @@ class CudaGraphConfig(BaseModel):
max_batch_size: int = Field(
default=0, description="Maximum batch size for CUDA graphs.")
padding_enabled: bool = Field(
enable_padding: bool = Field(
default=False,
description=
"If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
@@ -88,6 +88,30 @@ class CudaGraphConfig(BaseModel):
return v
class MoeConfig(BaseModel):
"""
Configuration for MoE.
"""
backend: Literal["CUTLASS", "CUTEDSL", "WIDEEP", "TRTLLM",
"VANILLA"] = Field(default='CUTLASS',
description="MoE backend to use.")
max_num_tokens: Optional[int] = Field(
default=None,
description=
"If set, at most max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds max_num_tokens, the input tensors will be split into chunks and a for loop will be used."
)
load_balancer: Optional[Union[object, str]] = Field(
default=None,
description="Configuration for MoE load balancing.",
json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})
@classmethod
def from_dict(cls, data: dict):
return cls(**data)
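The new MoeConfig groups the MoE options that previously lived directly on TorchLlmArgs (see the removals further down). A brief construction sketch, with the import path assumed from this diff:

```python
# Sketch: the new MoeConfig can be built directly or from a plain dict
# (import path assumed from this diff).
from tensorrt_llm.llmapi.llm_args import MoeConfig

moe = MoeConfig.from_dict({"backend": "WIDEEP", "max_num_tokens": 8192})
assert moe.backend == "WIDEEP" and moe.max_num_tokens == 8192
```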
@dataclass
class _ParallelConfig:
''' The model distribution configs for LLM. '''
@@ -330,8 +354,9 @@ class EagleDecodingConfig(DecodingBaseConfig):
class UserProvidedDecodingConfig(DecodingBaseConfig):
# Type should be Drafter, but it leads to circular import
drafter: object
# Cannot use real type annotations due to circular imports
drafter: object # Type is Drafter
resource_manager: object = None # Type is Optional[ResourceManager]
@classmethod
def from_dict(cls, data: dict):
@@ -797,6 +822,10 @@ class KvCacheConfig(BaseModel, PybindMirror):
use_uvm: bool = Field(default=False,
description="Whether to use UVM for the KV cache.")
# This is a pure python field, not a pybind field. It is only for the Pytorch backend.
dtype: str = Field(default="auto",
description="The data type to use for the KV cache.")
def _to_pybind(self):
return _KvCacheConfig(
enable_block_reuse=self.enable_block_reuse,
@@ -850,12 +879,20 @@ class CacheTransceiverConfig(BaseModel, PybindMirror):
"""
Configuration for the cache transceiver.
"""
max_num_tokens: Optional[int] = Field(
backend: Optional[Literal["default", "ucx", "nixl", "mpi"]] = Field(
default=None,
description=
"The communication backend type to use for the cache transceiver.")
max_tokens_in_buffer: Optional[int] = Field(
default=None,
description="The max number of tokens the transfer buffer can fit.")
def _to_pybind(self):
return _CacheTransceiverConfig(max_num_tokens=self.max_num_tokens)
return _CacheTransceiverConfig(
backend=self.backend,
max_tokens_in_buffer=self.max_tokens_in_buffer)
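A construction sketch for the reworked transceiver config, with field names taken from this diff (import path assumed):

```python
# Sketch: max_num_tokens was renamed to max_tokens_in_buffer, and an explicit
# communication backend can now be selected (import path assumed).
from tensorrt_llm.llmapi.llm_args import CacheTransceiverConfig

ct = CacheTransceiverConfig(backend="ucx", max_tokens_in_buffer=4096)
```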
@dataclass
@@ -1000,10 +1037,6 @@ class BaseLlmArgs(BaseModel):
lora_config: Optional[LoraConfig] = Field(
default=None, description="LoRA configuration for the model.")
# Quantization and calibration configurations
quant_config: Optional[QuantConfig] = Field(
default=None, description="Quantization config.", validate_default=True)
# Several options from ExecutorConfig, expanded here for less hierarchy
kv_cache_config: KvCacheConfig = Field(default_factory=KvCacheConfig,
description="KV cache config.")
@@ -1184,13 +1217,6 @@ class BaseLlmArgs(BaseModel):
raise RuntimeError("Pre SM 80 GPUs do not support bfloat16")
return v
@field_validator("quant_config", mode='before')
@classmethod
def validate_quant_config(cls, v, info):
if v is None:
v = QuantConfig()
return v
@field_validator("gpus_per_node", mode='before')
@classmethod
def validate_gpus_per_node(cls, v, info):
@@ -1262,7 +1288,8 @@ class BaseLlmArgs(BaseModel):
'pytorch', '_autodeploy'
]:
# Load parallel_config from the engine.
model_format = get_model_format(self.model)
model_format = get_model_format(
self.model, trust_remote_code=self.trust_remote_code)
if model_format is _ModelFormatKind.TLLM_ENGINE:
if self.build_config is not None:
@@ -1330,6 +1357,15 @@ class BaseLlmArgs(BaseModel):
return self
@model_validator(mode="after")
def validate_runtime_args(self):
if self.max_batch_size is not None and self.max_num_tokens is not None:
if self.max_batch_size > self.max_num_tokens:
logger.warning(
f"max_batch_size [{self.max_batch_size}] should be less than or equal to max_num_tokens [{self.max_num_tokens}]"
)
return self
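Note the check is a warning rather than a hard error, so misordered limits still construct. A hypothetical illustration (the model path and argument plumbing are assumptions, not from this diff):

```python
# Hypothetical: construction succeeds, but the validator above logs
# "max_batch_size [256] should be less than or equal to max_num_tokens [128]".
from tensorrt_llm.llmapi.llm_args import TorchLlmArgs  # import path assumed

args = TorchLlmArgs(model="/path/to/model",  # illustrative path
                    max_batch_size=256,
                    max_num_tokens=128)
```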
@model_validator(mode="after")
def validate_build_config_with_runtime_params(self):
# Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,
@@ -1632,6 +1668,10 @@ class TrtLlmArgs(BaseLlmArgs):
calib_config: Optional[CalibConfig] = Field(
default=None, description="Calibration config.", validate_default=True)
# Quantization and calibration configurations
quant_config: Optional[QuantConfig] = Field(
default=None, description="Quantization config.", validate_default=True)
embedding_parallel_mode: str = Field(
default='SHARDING_ALONG_VOCAB',
description="The embedding parallel mode.")
@@ -1669,6 +1709,13 @@ class TrtLlmArgs(BaseLlmArgs):
return CalibConfig()
return v
@field_validator("quant_config", mode='before')
@classmethod
def validate_quant_config(cls, v, info):
if v is None:
v = QuantConfig()
return v
@model_validator(mode="after")
def setup_embedding_parallel_mode(self):
if self.embedding_parallel_mode == 'NONE':
@@ -1713,6 +1760,11 @@ class TrtLlmArgs(BaseLlmArgs):
f"Invalid build_cache_config: {self.enable_build_cache}")
return self
@model_validator(mode="after")
def validate_kv_cache_dtype(self):
assert self.kv_cache_config.dtype == "auto", "KvCacheConfig.dtype is not supported by the TensorRT backend."
return self
class LoadFormat(Enum):
AUTO = 0
@@ -1757,7 +1809,7 @@ class TorchLlmArgs(BaseLlmArgs):
"Lower values trigger more frequent garbage collection.")
cuda_graph_config: Optional[CudaGraphConfig] = Field(
default=None,
default_factory=CudaGraphConfig,
description="CUDA graph config.If true, use CUDA graphs for decoding. \
CUDA graphs are only created for the batch sizes in cuda_graph_config.batch_sizes, \
and are enabled for batches that consist of decoding requests *only* \
@@ -1768,26 +1820,12 @@ class TorchLlmArgs(BaseLlmArgs):
disable_overlap_scheduler: bool = Field(
default=False, description="Disable the overlap scheduler.")
moe_max_num_tokens: Optional[int] = Field(
default=None,
description=
"If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used."
)
moe_load_balancer: Optional[Union[object, str]] = Field(
default=None,
description="Configuration for MoE load balancing.",
json_schema_extra={
"type":
"Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]"
})
moe_config: MoeConfig = Field(default_factory=MoeConfig,
description="MoE config.")
attn_backend: str = Field(default='TRTLLM',
description="Attention backend to use.")
moe_backend: str = Field(default='CUTLASS',
description="MoE backend to use.")
enable_mixed_sampler: bool = Field(
default=False,
description=
@@ -1800,9 +1838,6 @@ class TorchLlmArgs(BaseLlmArgs):
"If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies."
)
kv_cache_dtype: str = Field(default="auto",
description="Data type for KV cache.")
enable_iter_perf_stats: bool = Field(
default=False, description="Enable iteration performance statistics.")
@@ -1855,6 +1890,31 @@ class TorchLlmArgs(BaseLlmArgs):
'LOWPRECISION',
'MNNVL']] = Field(default='AUTO',
description="Allreduce strategy to use.")
checkpoint_loader: Optional[object] = Field(
default=None,
description="The checkpoint loader to use for this LLM instance.",
json_schema_extra={
"type": "Optional[tensorrt_llm._torch.BaseCheckpointLoader]"
},
)
checkpoint_format: Optional[str] = Field(
default=None,
description="The format of the provided checkpoint.",
)
# PrivateVars
_quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
@property
def quant_config(self) -> QuantConfig:
if self._quant_config is None:
self._quant_config = QuantConfig()
return self._quant_config
@quant_config.setter
def quant_config(self, value: QuantConfig):
self._quant_config = value
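With quant_config moved to a PrivateAttr-backed property, a default QuantConfig is created lazily on first access instead of being a pydantic Field. A sketch (model path illustrative; import path assumed):

```python
# Sketch: quant_config is no longer a pydantic Field on TorchLlmArgs.
from tensorrt_llm.llmapi.llm_args import TorchLlmArgs  # import path assumed

args = TorchLlmArgs(model="/path/to/model")  # illustrative path
qc = args.quant_config                       # first access lazily creates QuantConfig()
args.quant_config = qc                       # the setter still accepts a QuantConfig
```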
# TODO: remove backend later
@field_validator('backend', mode='before')
@@ -1889,25 +1949,6 @@ class TorchLlmArgs(BaseLlmArgs):
def extra_resource_managers(self, value: Dict[str, object]) -> None:
self._extra_resource_managers = value
@model_validator(mode="after")
def validate_moe_load_balancer(self):
from .._torch.model_config import MoeLoadBalancerConfig
if isinstance(self.moe_load_balancer, str):
if not os.path.exists(self.moe_load_balancer):
raise FileNotFoundError(
f"MoE load balancer config file not found: {self.moe_load_balancer}"
)
try:
with open(self.moe_load_balancer) as f:
moe_load_balancer_config = yaml.safe_load(f)
self.moe_load_balancer = MoeLoadBalancerConfig(
**moe_load_balancer_config)
except Exception as e:
raise ValueError(
f"Failed to load MoE load balancer config file: {self.moe_load_balancer}"
) from e
return self
@model_validator(mode="after")
def validate_stream_interval(self):
if self.stream_interval <= 0:
@@ -1915,19 +1956,35 @@ class TorchLlmArgs(BaseLlmArgs):
f"stream_interval must be positive, got {self.stream_interval}")
return self
@model_validator(mode="after")
def validate_checkpoint_format(self):
if self.checkpoint_format is not None and self.checkpoint_loader is not None:
logger.warning(
"checkpoint_format and checkpoint_loader are both provided, "
"checkpoint_loader will be ignored.")
self.checkpoint_loader = None
if self.checkpoint_format is None and self.checkpoint_loader is None:
logger.info(
"neither checkpoint_format nor checkpoint_loader were provided, "
"checkpoint_format will be set to HF.")
self.checkpoint_format = "HF"
return self
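In short: if both options are given, checkpoint_loader is discarded; if neither is given, checkpoint_format defaults to "HF". A hypothetical illustration (`my_loader` is a stand-in for a BaseCheckpointLoader instance; model paths are illustrative):

```python
from tensorrt_llm.llmapi.llm_args import TorchLlmArgs  # import path assumed

# Hypothetical: neither option given -> format resolves to "HF".
args = TorchLlmArgs(model="/path/to/model")
assert args.checkpoint_format == "HF"

# Hypothetical: both given -> the loader is dropped with a warning.
my_loader = object()  # stand-in for a BaseCheckpointLoader
args = TorchLlmArgs(model="/path/to/model",
                    checkpoint_format="HF",
                    checkpoint_loader=my_loader)
assert args.checkpoint_loader is None
```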
@staticmethod
def _generate_cuda_graph_batch_sizes(max_batch_size: int,
padding_enabled: bool) -> List[int]:
enable_padding: bool) -> List[int]:
"""Generate a list of batch sizes for CUDA graphs.
Args:
max_batch_size: Maximum batch size to generate up to
padding_enabled: Whether padding is enabled, which affects the batch size distribution
enable_padding: Whether padding is enabled, which affects the batch size distribution
Returns:
List of batch sizes to create CUDA graphs for
"""
if padding_enabled:
if enable_padding:
batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
else:
batch_sizes = list(range(1, 32)) + [32, 64, 128]
@@ -1947,6 +2004,25 @@ class TorchLlmArgs(BaseLlmArgs):
return batch_sizes
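From the branch shown above, padding mode starts from the coarser candidate list [1, 2, 4] + [8, 16, ..., 128]; the elided tail clips the list to max_batch_size, so the result below is an inference from this diff, not verbatim output:

```python
# Inferred behavior of the helper (the clipping tail is elided in this diff).
sizes = TorchLlmArgs._generate_cuda_graph_batch_sizes(16, enable_padding=True)
# candidates [1, 2, 4] + [8, 16, ..., 128], kept only up to max_batch_size
# -> expected [1, 2, 4, 8, 16]
```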
@model_validator(mode="after")
def validate_load_balancer(self) -> 'TorchLlmArgs':
from .._torch import MoeLoadBalancerConfig
if isinstance(self.moe_config.load_balancer, str):
if not os.path.exists(self.moe_config.load_balancer):
raise FileNotFoundError(
f"MoE load balancer config file not found: {self.moe_config.load_balancer}"
)
try:
with open(self.moe_config.load_balancer) as f:
moe_load_balancer_config = yaml.safe_load(f)
self.moe_config.load_balancer = MoeLoadBalancerConfig(
**moe_load_balancer_config)
except Exception as e:
raise ValueError(
f"Failed to load MoE load balancer config file: {self.load_balancer}"
) from e
return self
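moe_config.load_balancer therefore accepts either a MoeLoadBalancerConfig or a path to a YAML file whose keys map onto that config. A sketch (file name is illustrative):

```python
# Sketch: a string value is treated as a YAML file path; during validation it
# is parsed with yaml.safe_load and re-wrapped as MoeLoadBalancerConfig(**data).
from tensorrt_llm.llmapi.llm_args import MoeConfig  # import path assumed

moe = MoeConfig(load_balancer="moe_load_balancer.yaml")  # illustrative file name
```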
@model_validator(mode='after')
def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
"""Validate CUDA graph configuration.
@@ -1965,7 +2041,7 @@ class TorchLlmArgs(BaseLlmArgs):
config.batch_sizes = sorted(config.batch_sizes)
if config.max_batch_size != 0:
if config.batch_sizes != self._generate_cuda_graph_batch_sizes(
config.max_batch_size, config.padding_enabled):
config.max_batch_size, config.enable_padding):
raise ValueError(
"Please don't set both cuda_graph_config.batch_sizes "
"and cuda_graph_config.max_batch_size.\n"
@@ -1977,12 +2053,28 @@ class TorchLlmArgs(BaseLlmArgs):
else:
max_batch_size = config.max_batch_size or 128
generated_sizes = self._generate_cuda_graph_batch_sizes(
max_batch_size, config.padding_enabled)
max_batch_size, config.enable_padding)
config.batch_sizes = generated_sizes
config.max_batch_size = max_batch_size
return self
@model_validator(mode='after')
def sync_quant_config_with_kv_cache_config_dtype(self) -> 'TorchLlmArgs':
if self.kv_cache_config is None:
return self
assert self.quant_config is not None
if self.kv_cache_config.dtype == "auto":
return self
elif self.kv_cache_config.dtype == 'fp8':
self.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
else:
logger.warning(
f"Cannot sync quant_config.kv_cache_quant_algo with kv_cache_config.dtype of {self.kv_cache_config.dtype}, "
"please update the validator")
return self
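Effectively, kv_cache_config.dtype="fp8" now implies quant_config.kv_cache_quant_algo == QuantAlgo.FP8 once validation runs. A sketch (model path illustrative; import paths assumed):

```python
# Sketch: the validator above links the two configs after construction.
from tensorrt_llm.llmapi.llm_args import KvCacheConfig, TorchLlmArgs  # paths assumed
from tensorrt_llm.quantization import QuantAlgo  # import path assumed

args = TorchLlmArgs(model="/path/to/model",  # illustrative path
                    kv_cache_config=KvCacheConfig(dtype="fp8"))
assert args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
```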
# TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
def get_pytorch_backend_config(self) -> "PyTorchConfig":
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
@@ -1996,17 +2088,17 @@ class TorchLlmArgs(BaseLlmArgs):
cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size
if self.cuda_graph_config else
CudaGraphConfig.model_fields['max_batch_size'].default,
cuda_graph_padding_enabled=self.cuda_graph_config.padding_enabled
cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding
if self.cuda_graph_config else
CudaGraphConfig.model_fields['padding_enabled'].default,
CudaGraphConfig.model_fields['enable_padding'].default,
disable_overlap_scheduler=self.disable_overlap_scheduler,
moe_max_num_tokens=self.moe_max_num_tokens,
moe_load_balancer=self.moe_load_balancer,
moe_max_num_tokens=self.moe_config.max_num_tokens,
moe_load_balancer=self.moe_config.load_balancer,
attn_backend=self.attn_backend,
moe_backend=self.moe_backend,
moe_backend=self.moe_config.backend,
enable_mixed_sampler=self.enable_mixed_sampler,
enable_trtllm_sampler=self.enable_trtllm_sampler,
kv_cache_dtype=self.kv_cache_dtype,
kv_cache_dtype=self.kv_cache_config.dtype,
enable_iter_perf_stats=self.enable_iter_perf_stats,
enable_iter_req_stats=self.enable_iter_req_stats,
print_iter_log=self.print_iter_log,
@@ -2046,10 +2138,12 @@ def update_llm_args_with_extra_dict(
"enable_build_cache": BuildCacheConfig,
"speculative_config": DecodingBaseConfig,
"lora_config": LoraConfig,
"moe_config": MoeConfig,
}
for field_name, field_type in field_mapping.items():
if field_name in llm_args_dict:
if field_name == "speculative_config":
# Some fields need to be converted manually.
if field_name in ["speculative_config", "build_config"]:
llm_args_dict[field_name] = field_type.from_dict(
llm_args_dict[field_name])
else:
@@ -2072,7 +2166,8 @@ def update_llm_args_with_extra_options(llm_args: Dict,
return llm_args
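With moe_config added to the field mapping above, nested dicts from an extra-options file are coerced into their config classes. A hypothetical call (the exact signature of update_llm_args_with_extra_dict is not shown in this diff, so treat the argument order as an assumption):

```python
# Hypothetical: dict entries such as "moe_config" are converted to MoeConfig;
# "speculative_config" and "build_config" go through their from_dict hooks.
from tensorrt_llm.llmapi.llm_args import update_llm_args_with_extra_dict  # path assumed

llm_args = update_llm_args_with_extra_dict(
    {"model": "/path/to/model"},            # illustrative base args
    {"moe_config": {"backend": "TRTLLM"}},  # illustrative extras
)
```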
def get_model_format(model_dir: str) -> _ModelFormatKind:
def get_model_format(model_dir: str,
trust_remote_code: bool = False) -> _ModelFormatKind:
''' Get the format of the model. '''
if not (Path(model_dir) / 'config.json').exists():
raise ValueError(
@@ -2091,7 +2186,8 @@ def get_model_format(model_dir: str) -> _ModelFormatKind:
PretrainedConfig.from_checkpoint(model_dir)
else:
model_format = _ModelFormatKind.HF
AutoConfig.from_hugging_face(model_dir)
AutoConfig.from_hugging_face(model_dir,
trust_remote_code=trust_remote_code)
except Exception as e:
raise ValueError(
f"Inferred model format {model_format}, but failed to load config.json: {e}"

View File

@@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@@ -672,9 +676,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@@ -507,12 +511,14 @@
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">copy</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">dataclasses</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">json</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">math</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">shutil</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">time</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">dataclasses</span><span class="w"> </span><span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">functools</span><span class="w"> </span><span class="kn">import</span> <span class="n">cache</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pathlib</span><span class="w"> </span><span class="kn">import</span> <span class="n">Path</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Union</span>
@@ -1056,6 +1062,18 @@
<span class="n">override_attri</span><span class="p">(</span><span class="s1">&#39;paged_state&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span></div>
<div class="viewcode-block" id="BuildConfig.get_build_config_defaults">
<a class="viewcode-back" href="../../llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig.get_build_config_defaults">[docs]</a>
<span class="nd">@classmethod</span>
<span class="nd">@cache</span>
<span class="k">def</span><span class="w"> </span><span class="nf">get_build_config_defaults</span><span class="p">(</span><span class="bp">cls</span><span class="p">):</span>
<span class="k">return</span> <span class="p">{</span>
<span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">:</span> <span class="n">field</span><span class="o">.</span><span class="n">default</span>
<span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">dataclasses</span><span class="o">.</span><span class="n">fields</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span>
<span class="k">if</span> <span class="n">field</span><span class="o">.</span><span class="n">default</span> <span class="ow">is</span> <span class="ow">not</span> <span class="n">dataclasses</span><span class="o">.</span><span class="n">MISSING</span>
<span class="p">}</span></div>
<div class="viewcode-block" id="BuildConfig.from_dict">
<a class="viewcode-back" href="../../llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig.from_dict">[docs]</a>
<span class="nd">@classmethod</span>
@@ -1063,48 +1081,75 @@
<span class="n">config</span> <span class="o">=</span> <span class="n">copy</span><span class="o">.</span><span class="n">deepcopy</span><span class="p">(</span>
<span class="n">config</span>
<span class="p">)</span> <span class="c1"># it just does not make sense to change the input arg `config`</span>
<span class="n">max_input_len</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_input_len&#39;</span><span class="p">)</span>
<span class="n">max_seq_len</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_seq_len&#39;</span><span class="p">)</span>
<span class="n">max_batch_size</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_batch_size&#39;</span><span class="p">)</span>
<span class="n">max_beam_width</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_beam_width&#39;</span><span class="p">)</span>
<span class="n">max_num_tokens</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_num_tokens&#39;</span><span class="p">)</span>
<span class="n">opt_num_tokens</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;opt_num_tokens&#39;</span><span class="p">)</span>
<span class="n">opt_batch_size</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;opt_batch_size&#39;</span><span class="p">,</span> <span class="mi">8</span><span class="p">)</span>
<span class="n">max_prompt_embedding_table_size</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span>
<span class="s1">&#39;max_prompt_embedding_table_size&#39;</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
<span class="n">kv_cache_type</span> <span class="o">=</span> <span class="n">KVCacheType</span><span class="p">(</span>
<span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;kv_cache_type&#39;</span><span class="p">))</span> <span class="k">if</span> <span class="s1">&#39;plugin_config&#39;</span> <span class="ow">in</span> <span class="n">config</span> <span class="k">else</span> <span class="kc">None</span>
<span class="n">gather_context_logits</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;gather_context_logits&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">gather_generation_logits</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;gather_generation_logits&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">strongly_typed</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;strongly_typed&#39;</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="n">force_num_profiles</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;force_num_profiles&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">weight_sparsity</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;weight_sparsity&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">defaults</span> <span class="o">=</span> <span class="bp">cls</span><span class="o">.</span><span class="n">get_build_config_defaults</span><span class="p">()</span>
<span class="n">max_input_len</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_input_len&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;max_input_len&#39;</span><span class="p">))</span>
<span class="n">max_seq_len</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_seq_len&#39;</span><span class="p">,</span> <span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;max_seq_len&#39;</span><span class="p">))</span>
<span class="n">max_batch_size</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_batch_size&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;max_batch_size&#39;</span><span class="p">))</span>
<span class="n">max_beam_width</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_beam_width&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;max_beam_width&#39;</span><span class="p">))</span>
<span class="n">max_num_tokens</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_num_tokens&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;max_num_tokens&#39;</span><span class="p">))</span>
<span class="n">opt_num_tokens</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;opt_num_tokens&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;opt_num_tokens&#39;</span><span class="p">))</span>
<span class="n">opt_batch_size</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;opt_batch_size&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;opt_batch_size&#39;</span><span class="p">))</span>
<span class="n">max_prompt_embedding_table_size</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span>
<span class="s1">&#39;max_prompt_embedding_table_size&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;max_prompt_embedding_table_size&#39;</span><span class="p">))</span>
<span class="k">if</span> <span class="s2">&quot;kv_cache_type&quot;</span> <span class="ow">in</span> <span class="n">config</span> <span class="ow">and</span> <span class="n">config</span><span class="p">[</span><span class="s2">&quot;kv_cache_type&quot;</span><span class="p">]</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">kv_cache_type</span> <span class="o">=</span> <span class="n">KVCacheType</span><span class="p">(</span><span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;kv_cache_type&#39;</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">kv_cache_type</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">gather_context_logits</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span>
<span class="s1">&#39;gather_context_logits&#39;</span><span class="p">,</span> <span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;gather_context_logits&#39;</span><span class="p">))</span>
<span class="n">gather_generation_logits</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span>
<span class="s1">&#39;gather_generation_logits&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;gather_generation_logits&#39;</span><span class="p">))</span>
<span class="n">strongly_typed</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;strongly_typed&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;strongly_typed&#39;</span><span class="p">))</span>
<span class="n">force_num_profiles</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;force_num_profiles&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;force_num_profiles&#39;</span><span class="p">))</span>
<span class="n">weight_sparsity</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;weight_sparsity&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;weight_sparsity&#39;</span><span class="p">))</span>
<span class="n">profiling_verbosity</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;profiling_verbosity&#39;</span><span class="p">,</span>
<span class="s1">&#39;layer_names_only&#39;</span><span class="p">)</span>
<span class="n">enable_debug_output</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;enable_debug_output&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">max_draft_len</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_draft_len&#39;</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
<span class="n">speculative_decoding_mode</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;speculative_decoding_mode&#39;</span><span class="p">,</span>
<span class="n">SpeculativeDecodingMode</span><span class="o">.</span><span class="n">NONE</span><span class="p">)</span>
<span class="n">use_refit</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;use_refit&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">input_timing_cache</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;input_timing_cache&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">output_timing_cache</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;output_timing_cache&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;profiling_verbosity&#39;</span><span class="p">))</span>
<span class="n">enable_debug_output</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;enable_debug_output&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;enable_debug_output&#39;</span><span class="p">))</span>
<span class="n">max_draft_len</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_draft_len&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;max_draft_len&#39;</span><span class="p">))</span>
<span class="n">speculative_decoding_mode</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span>
<span class="s1">&#39;speculative_decoding_mode&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;speculative_decoding_mode&#39;</span><span class="p">))</span>
<span class="n">use_refit</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;use_refit&#39;</span><span class="p">,</span> <span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;use_refit&#39;</span><span class="p">))</span>
<span class="n">input_timing_cache</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;input_timing_cache&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;input_timing_cache&#39;</span><span class="p">))</span>
<span class="n">output_timing_cache</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;output_timing_cache&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;output_timing_cache&#39;</span><span class="p">))</span>
<span class="n">lora_config</span> <span class="o">=</span> <span class="n">LoraConfig</span><span class="o">.</span><span class="n">from_dict</span><span class="p">(</span><span class="n">config</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;lora_config&#39;</span><span class="p">,</span> <span class="p">{}))</span>
<span class="n">auto_parallel_config</span> <span class="o">=</span> <span class="n">AutoParallelConfig</span><span class="o">.</span><span class="n">from_dict</span><span class="p">(</span>
<span class="n">config</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;auto_parallel_config&#39;</span><span class="p">,</span> <span class="p">{}))</span>
<span class="n">max_encoder_input_len</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;max_encoder_input_len&#39;</span><span class="p">,</span> <span class="mi">1024</span><span class="p">)</span>
<span class="n">weight_streaming</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;weight_streaming&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">use_strip_plan</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;use_strip_plan&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">max_encoder_input_len</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span>
<span class="s1">&#39;max_encoder_input_len&#39;</span><span class="p">,</span> <span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;max_encoder_input_len&#39;</span><span class="p">))</span>
<span class="n">weight_streaming</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;weight_streaming&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;weight_streaming&#39;</span><span class="p">))</span>
<span class="n">use_strip_plan</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;use_strip_plan&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;use_strip_plan&#39;</span><span class="p">))</span>
<span class="k">if</span> <span class="n">plugin_config</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">plugin_config</span> <span class="o">=</span> <span class="n">PluginConfig</span><span class="p">()</span>
<span class="k">if</span> <span class="s2">&quot;plugin_config&quot;</span> <span class="ow">in</span> <span class="n">config</span><span class="o">.</span><span class="n">keys</span><span class="p">():</span>
<span class="n">plugin_config</span><span class="o">.</span><span class="n">update_from_dict</span><span class="p">(</span><span class="n">config</span><span class="p">[</span><span class="s2">&quot;plugin_config&quot;</span><span class="p">])</span>
<span class="n">dry_run</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;dry_run&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">visualize_network</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;visualize_network&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">monitor_memory</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;monitor_memory&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">use_mrope</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;use_mrope&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="n">dry_run</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;dry_run&#39;</span><span class="p">,</span> <span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;dry_run&#39;</span><span class="p">))</span>
<span class="n">visualize_network</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;visualize_network&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;visualize_network&#39;</span><span class="p">))</span>
<span class="n">monitor_memory</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;monitor_memory&#39;</span><span class="p">,</span>
<span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;monitor_memory&#39;</span><span class="p">))</span>
<span class="n">use_mrope</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;use_mrope&#39;</span><span class="p">,</span> <span class="n">defaults</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;use_mrope&#39;</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">cls</span><span class="p">(</span>
<span class="n">max_input_len</span><span class="o">=</span><span class="n">max_input_len</span><span class="p">,</span>
@@ -1981,9 +2026,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@@ -502,10 +506,10 @@
<a class="viewcode-back" href="../../llm-api/reference.html#tensorrt_llm.llmapi.DisaggregatedParams">[docs]</a>
<span class="nd">@dataclass</span><span class="p">(</span><span class="n">slots</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">kw_only</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">class</span><span class="w"> </span><span class="nc">DisaggregatedParams</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Disaggregated seving parameters.</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Disaggregated serving parameters.</span>
<span class="sd"> Args:</span>
<span class="sd"> request_type (str): The type of request (&quot;context_only&quot; or &quot;generation_only&quot;)</span>
<span class="sd"> request_type (str): The type of request (&quot;context_only&quot; | &quot;generation_only&quot; | &quot;context_and_generation&quot;)</span>
<span class="sd"> first_gen_tokens (List[int]): The first tokens of the generation request</span>
<span class="sd"> ctx_request_id (int): The context request id</span>
<span class="sd"> opaque_state(bytes): Any additional state needing to be exchanged between context and gen instances</span>
@@ -652,9 +656,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@@ -1268,9 +1272,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@@ -775,9 +779,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -5738,6 +5742,51 @@
<span class="n">scaling_long_factors</span><span class="p">,</span> <span class="kc">False</span><span class="p">,</span> <span class="kc">True</span><span class="p">),</span> <span class="n">short_mscale</span></div>
<div class="viewcode-block" id="RopeEmbeddingUtils.create_sinusoidal_positions_long_rope_for_attention_plugin">
<a class="viewcode-back" href="../../python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_long_rope_for_attention_plugin">[docs]</a>
<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">create_sinusoidal_positions_long_rope_for_attention_plugin</span><span class="p">(</span>
<span class="n">num_pos</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">dim</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">theta</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">original_max_pos</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">short_factor</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span>
<span class="n">long_factor</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">):</span>
<span class="n">short_factor</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">short_factor</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="n">long_factor</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">long_factor</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="n">inv_freq</span> <span class="o">=</span> <span class="mf">1.0</span> <span class="o">/</span> <span class="p">(</span><span class="n">theta</span><span class="o">**</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">dim</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span> <span class="o">/</span> <span class="n">dim</span><span class="p">))</span>
<span class="c1"># Short part</span>
<span class="n">inv_freq_short</span> <span class="o">=</span> <span class="n">inv_freq</span> <span class="o">/</span> <span class="n">short_factor</span>
<span class="n">t_short</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">min</span><span class="p">([</span><span class="n">num_pos</span><span class="p">,</span> <span class="n">original_max_pos</span><span class="p">]),</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="n">freqs_short</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">einsum</span><span class="p">(</span><span class="s2">&quot;i,j-&gt;ij&quot;</span><span class="p">,</span> <span class="n">t_short</span><span class="p">,</span> <span class="n">inv_freq_short</span><span class="p">)</span>
<span class="c1"># Long part</span>
<span class="n">inv_freq_long</span> <span class="o">=</span> <span class="n">inv_freq</span> <span class="o">/</span> <span class="n">long_factor</span>
<span class="n">t_long</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_pos</span> <span class="o">-</span> <span class="n">original_max_pos</span><span class="p">]),</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span> <span class="o">+</span> <span class="n">original_max_pos</span>
<span class="n">freqs_long</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">einsum</span><span class="p">(</span><span class="s2">&quot;i,j-&gt;ij&quot;</span><span class="p">,</span> <span class="n">t_long</span><span class="p">,</span> <span class="n">inv_freq_long</span><span class="p">)</span>
<span class="n">freqs</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">concatenate</span><span class="p">([</span><span class="n">freqs_short</span><span class="p">,</span> <span class="n">freqs_long</span><span class="p">],</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="n">sinusoid_inp</span> <span class="o">=</span> <span class="n">freqs</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)[</span><span class="o">...</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">newaxis</span><span class="p">]</span>
<span class="c1"># Apply scaling</span>
<span class="n">scale</span> <span class="o">=</span> <span class="n">num_pos</span> <span class="o">/</span> <span class="n">original_max_pos</span>
<span class="n">scaling_factor</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">1.0</span> <span class="o">+</span> <span class="n">np</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">scale</span><span class="p">)</span> <span class="o">/</span> <span class="n">np</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">original_max_pos</span><span class="p">))</span>
<span class="c1"># fuse cos/sin into float2 (cos, sin).</span>
<span class="n">concat</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">concatenate</span><span class="p">(</span>
<span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">cos</span><span class="p">(</span><span class="n">sinusoid_inp</span><span class="p">)</span> <span class="o">*</span> <span class="n">scaling_factor</span><span class="p">,</span>
<span class="n">np</span><span class="o">.</span><span class="n">sin</span><span class="p">(</span><span class="n">sinusoid_inp</span><span class="p">)</span> <span class="o">*</span> <span class="n">scaling_factor</span><span class="p">),</span>
<span class="n">axis</span><span class="o">=-</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="kc">None</span><span class="p">,</span> <span class="n">concat</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span></div>
<div class="viewcode-block" id="RopeEmbeddingUtils.create_fake_weight">
<a class="viewcode-back" href="../../python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.RopeEmbeddingUtils.create_fake_weight">[docs]</a>
<span class="nd">@staticmethod</span>

View File

@ -834,9 +838,9 @@
<span class="c1"># With pytorch backend, py_executor has logic to handle max_tokens of 1,</span>
<span class="c1"># so set to 1 to avoid allocating unnecessary KV cache blocks for single request</span>
<span class="c1"># TODO: Also support for trt backend</span>
<span class="k">if</span> <span class="p">(</span><span class="n">disaggregated_params</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="ow">and</span> <span class="n">disaggregated_params</span><span class="o">.</span><span class="n">request_type</span> <span class="o">==</span> <span class="s2">&quot;context_only&quot;</span>
<span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span><span class="p">):</span>
<span class="n">is_ctx_only</span> <span class="o">=</span> <span class="n">disaggregated_params</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">disaggregated_params</span><span class="o">.</span><span class="n">request_type</span> <span class="o">==</span> <span class="s2">&quot;context_only&quot;</span>
<span class="n">is_gen_only</span> <span class="o">=</span> <span class="n">disaggregated_params</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">disaggregated_params</span><span class="o">.</span><span class="n">request_type</span> <span class="o">==</span> <span class="s2">&quot;generation_only&quot;</span>
<span class="k">if</span> <span class="n">is_ctx_only</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span><span class="p">:</span>
<span class="n">sampling_params</span><span class="o">.</span><span class="n">max_tokens</span> <span class="o">=</span> <span class="mi">1</span>
<span class="n">inputs</span> <span class="o">=</span> <span class="n">prompt_inputs</span><span class="p">(</span><span class="n">inputs</span><span class="p">)</span>
@ -901,7 +905,8 @@
<span class="bp">self</span><span class="o">.</span><span class="n">_check_arguments</span><span class="p">(</span>
<span class="nb">len</span><span class="p">(</span><span class="n">prompt_token_ids</span><span class="p">),</span>
<span class="nb">len</span><span class="p">(</span><span class="n">query_token_ids</span><span class="p">)</span> <span class="k">if</span> <span class="n">query_token_ids</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">sampling_params</span><span class="p">)</span>
<span class="n">sampling_params</span><span class="p">,</span>
<span class="n">is_gen_only</span><span class="o">=</span><span class="n">is_gen_only</span><span class="p">)</span>
<span class="k">if</span> <span class="n">_postproc_params</span><span class="p">:</span>
<span class="n">_postproc_params</span><span class="o">.</span><span class="n">postproc_args</span><span class="o">.</span><span class="n">num_prompt_tokens</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span>
<span class="n">prompt_token_ids</span><span class="p">)</span>
@ -1029,7 +1034,8 @@
<span class="k">return</span> <span class="n">sampling_params</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_check_arguments</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">prompt_len</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">query_len</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">sampling_params</span><span class="p">:</span> <span class="n">SamplingParams</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">sampling_params</span><span class="p">:</span> <span class="n">SamplingParams</span><span class="p">,</span>
<span class="n">is_gen_only</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">backend</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;pytorch&quot;</span><span class="p">,</span> <span class="s2">&quot;_autodeploy&quot;</span><span class="p">]:</span>
<span class="c1"># TODO: remove these checks after PyTorch backend</span>
@ -1042,6 +1048,14 @@
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;PyTorch backend currently only supports `logprobs=1`. Received `logprobs=</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">logprobs</span><span class="si">}</span><span class="s2">` (Top</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">logprobs</span><span class="si">}</span><span class="s2"> logprobs). Please set `logprobs=1` in `sampling_params` instead.&quot;</span>
<span class="p">)</span>
<span class="c1"># Check prompt length and query length against max_num_tokens to filter illegal requests.</span>
<span class="c1"># Skip check for gen-only requests</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">backend</span> <span class="o">==</span> <span class="s2">&quot;pytorch&quot;</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">enable_chunked_prefill</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">is_gen_only</span><span class="p">:</span>
<span class="n">max_num_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_num_tokens</span>
<span class="k">if</span> <span class="n">max_num_tokens</span> <span class="ow">and</span> <span class="n">prompt_len</span> <span class="o">/</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">cp_size</span> <span class="o">+</span> <span class="n">query_len</span> <span class="o">&gt;</span> <span class="n">max_num_tokens</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;The sum of prompt length (</span><span class="si">{</span><span class="n">prompt_len</span><span class="o">/</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">cp_size</span><span class="si">}</span><span class="s2">), query length (</span><span class="si">{</span><span class="n">query_len</span><span class="si">}</span><span class="s2">) should not exceed &quot;</span>
<span class="sa">f</span><span class="s2">&quot;max_num_tokens (</span><span class="si">{</span><span class="n">max_num_tokens</span><span class="si">}</span><span class="s2">)&quot;</span><span class="p">)</span>
<span class="k">return</span>
<span class="n">build_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span>
@ -1058,7 +1072,7 @@
<span class="p">(</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">max_tokens</span> <span class="ow">or</span> <span class="mi">0</span><span class="p">)</span> <span class="o">&gt;</span> <span class="n">max_seq_len</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;The sum of prompt length (</span><span class="si">{</span><span class="n">prompt_len</span><span class="o">/</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">cp_size</span><span class="si">}</span><span class="s2">) and query length (</span><span class="si">{</span><span class="n">query_len</span><span class="si">}</span><span class="s2">) max_tokens (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">max_tokens</span><span class="si">}</span><span class="s2">) should not exceed &quot;</span>
<span class="sa">f</span><span class="s2">&quot;max_seq_len (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_seq_len</span><span class="si">}</span><span class="s2">)&quot;</span><span class="p">)</span>
<span class="sa">f</span><span class="s2">&quot;max_seq_len (</span><span class="si">{</span><span class="n">max_seq_len</span><span class="si">}</span><span class="s2">)&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="ow">and</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&gt;</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">:</span>
<span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">==</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="p">:</span>
@ -1463,7 +1477,11 @@
<span class="n">speculative_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span>
<span class="n">hf_model_dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_hf_model_dir</span><span class="p">,</span>
<span class="n">max_input_len</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_input_len</span><span class="p">,</span>
<span class="n">max_seq_len</span><span class="o">=</span><span class="n">max_seq_len</span><span class="p">)</span>
<span class="n">max_seq_len</span><span class="o">=</span><span class="n">max_seq_len</span><span class="p">,</span>
<span class="n">checkpoint_format</span><span class="o">=</span><span class="kc">None</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">backend</span> <span class="o">==</span> <span class="s2">&quot;_autodeploy&quot;</span> <span class="k">else</span>
<span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">checkpoint_format</span><span class="p">,</span>
<span class="n">checkpoint_loader</span><span class="o">=</span><span class="kc">None</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">backend</span> <span class="o">==</span> <span class="s2">&quot;_autodeploy&quot;</span> <span class="k">else</span>
<span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">checkpoint_loader</span><span class="p">)</span>
<span class="c1"># TODO: revisit gather_context_logits</span>
<span class="n">return_logits</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">gather_generation_logits</span>

View File

@ -568,7 +572,7 @@
<span class="n">max_batch_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Maximum batch size for CUDA graphs.&quot;</span><span class="p">)</span>
<span class="n">padding_enabled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">enable_padding</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span>
<span class="s2">&quot;If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance.&quot;</span>
@ -588,6 +592,36 @@
<div class="viewcode-block" id="MoeConfig">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MoeConfig">[docs]</a>
<span class="k">class</span><span class="w"> </span><span class="nc">MoeConfig</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Configuration for MoE.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">backend</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;CUTLASS&quot;</span><span class="p">,</span> <span class="s2">&quot;CUTEDSL&quot;</span><span class="p">,</span> <span class="s2">&quot;WIDEEP&quot;</span><span class="p">,</span> <span class="s2">&quot;TRTLLM&quot;</span><span class="p">,</span>
<span class="s2">&quot;VANILLA&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s1">&#39;CUTLASS&#39;</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;MoE backend to use.&quot;</span><span class="p">)</span>
<span class="n">max_num_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span>
<span class="s2">&quot;If set, at most max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds max_num_tokens, the input tensors will be split into chunks and a for loop will be used.&quot;</span>
<span class="p">)</span>
<span class="n">load_balancer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">object</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Configuration for MoE load balancing.&quot;</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="s2">&quot;Union[MoeLoadBalancerConfig, str]&quot;</span><span class="p">})</span>
<div class="viewcode-block" id="MoeConfig.from_dict">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MoeConfig.from_dict">[docs]</a>
<span class="nd">@classmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">from_dict</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">cls</span><span class="p">(</span><span class="o">**</span><span class="n">data</span><span class="p">)</span></div>
</div>
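A hedged construction example for the new class (tensorrt_llm.llmapi.MoeConfig per the cross-reference above; the token cap is an arbitrary value):

# Cap how many tokens reach torch.ops.trtllm.fused_moe per call; larger
# inputs are split into chunks and processed in a loop.
from tensorrt_llm.llmapi import MoeConfig

moe = MoeConfig(backend="CUTLASS", max_num_tokens=8192)
# from_dict mirrors the constructor, e.g. when parsing YAML/JSON configs:
same = MoeConfig.from_dict({"backend": "CUTLASS", "max_num_tokens": 8192})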
<span class="nd">@dataclass</span>
<span class="k">class</span><span class="w"> </span><span class="nc">_ParallelConfig</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&#39;&#39;&#39; The model distribution configs for LLM. &#39;&#39;&#39;</span>
@ -862,8 +896,9 @@
<div class="viewcode-block" id="UserProvidedDecodingConfig">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.UserProvidedDecodingConfig">[docs]</a>
<span class="k">class</span><span class="w"> </span><span class="nc">UserProvidedDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
<span class="c1"># Type should be Drafter, but it leads to circular import</span>
<span class="n">drafter</span><span class="p">:</span> <span class="nb">object</span>
<span class="c1"># Cannot use real type annotations due to circular imports</span>
<span class="n">drafter</span><span class="p">:</span> <span class="nb">object</span> <span class="c1"># Type is Drafter</span>
<span class="n">resource_manager</span><span class="p">:</span> <span class="nb">object</span> <span class="o">=</span> <span class="kc">None</span> <span class="c1"># Type is Optional[ResourceManager]</span>
<div class="viewcode-block" id="UserProvidedDecodingConfig.from_dict">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.UserProvidedDecodingConfig.from_dict">[docs]</a>
@ -1398,6 +1433,10 @@
<span class="n">use_uvm</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Whether to use UVM for the KV cache.&quot;</span><span class="p">)</span>
<span class="c1"># This is a pure python field, not a pybind field. It is only for the Pytorch backend.</span>
<span class="n">dtype</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s2">&quot;auto&quot;</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;The data type to use for the KV cache.&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_to_pybind</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">_KvCacheConfig</span><span class="p">(</span>
<span class="n">enable_block_reuse</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_block_reuse</span><span class="p">,</span>
@ -1457,12 +1496,20 @@
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Configuration for the cache transceiver.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">max_num_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">backend</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Literal</span><span class="p">[</span><span class="s2">&quot;default&quot;</span><span class="p">,</span> <span class="s2">&quot;ucx&quot;</span><span class="p">,</span> <span class="s2">&quot;nixl&quot;</span><span class="p">,</span> <span class="s2">&quot;mpi&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span>
<span class="s2">&quot;The communication backend type to use for the cache transceiver.&quot;</span><span class="p">)</span>
<span class="n">max_tokens_in_buffer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;The max number of tokens the transfer buffer can fit.&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_to_pybind</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="n">_CacheTransceiverConfig</span><span class="p">(</span><span class="n">max_num_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">max_num_tokens</span><span class="p">)</span></div>
<span class="k">return</span> <span class="n">_CacheTransceiverConfig</span><span class="p">(</span>
<span class="n">backend</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">backend</span><span class="p">,</span>
<span class="n">max_tokens_in_buffer</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">max_tokens_in_buffer</span><span class="p">)</span></div>
@ -1608,10 +1655,6 @@
<span class="n">lora_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">LoraConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;LoRA configuration for the model.&quot;</span><span class="p">)</span>
<span class="c1"># Quantization and calibration configurations</span>
<span class="n">quant_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">QuantConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Quantization config.&quot;</span><span class="p">,</span> <span class="n">validate_default</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># Several options from ExecutorConfig, expanded here for less hierarchy</span>
<span class="n">kv_cache_config</span><span class="p">:</span> <span class="n">KvCacheConfig</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="n">KvCacheConfig</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;KV cache config.&quot;</span><span class="p">)</span>
@ -1792,13 +1835,6 @@
<span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span><span class="s2">&quot;Pre SM 80 GPUs do not support bfloat16&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">v</span>
<span class="nd">@field_validator</span><span class="p">(</span><span class="s2">&quot;quant_config&quot;</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;before&#39;</span><span class="p">)</span>
<span class="nd">@classmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_quant_config</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">v</span><span class="p">,</span> <span class="n">info</span><span class="p">):</span>
<span class="k">if</span> <span class="n">v</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">v</span> <span class="o">=</span> <span class="n">QuantConfig</span><span class="p">()</span>
<span class="k">return</span> <span class="n">v</span>
<span class="nd">@field_validator</span><span class="p">(</span><span class="s2">&quot;gpus_per_node&quot;</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;before&#39;</span><span class="p">)</span>
<span class="nd">@classmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_gpus_per_node</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">v</span><span class="p">,</span> <span class="n">info</span><span class="p">):</span>
@ -1870,7 +1906,8 @@
<span class="s1">&#39;pytorch&#39;</span><span class="p">,</span> <span class="s1">&#39;_autodeploy&#39;</span>
<span class="p">]:</span>
<span class="c1"># Load parallel_config from the engine.</span>
<span class="n">model_format</span> <span class="o">=</span> <span class="n">get_model_format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">)</span>
<span class="n">model_format</span> <span class="o">=</span> <span class="n">get_model_format</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">,</span> <span class="n">trust_remote_code</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">trust_remote_code</span><span class="p">)</span>
<span class="k">if</span> <span class="n">model_format</span> <span class="ow">is</span> <span class="n">_ModelFormatKind</span><span class="o">.</span><span class="n">TLLM_ENGINE</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
@ -1938,6 +1975,15 @@
<span class="k">return</span> <span class="bp">self</span>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_runtime_args</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_num_tokens</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="o">&gt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_num_tokens</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;max_batch_size [</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">max_batch_size</span><span class="si">}</span><span class="s2">] should be less than or equal to max_num_tokens [</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">max_num_tokens</span><span class="si">}</span><span class="s2">]&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_build_config_with_runtime_params</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="c1"># Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,</span>
@ -2242,6 +2288,10 @@
<span class="n">calib_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">CalibConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Calibration config.&quot;</span><span class="p">,</span> <span class="n">validate_default</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># Quantization and calibration configurations</span>
<span class="n">quant_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">QuantConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Quantization config.&quot;</span><span class="p">,</span> <span class="n">validate_default</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">embedding_parallel_mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="s1">&#39;SHARDING_ALONG_VOCAB&#39;</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;The embedding parallel mode.&quot;</span><span class="p">)</span>
@ -2282,6 +2332,16 @@
<span class="k">return</span> <span class="n">v</span></div>
<div class="viewcode-block" id="TrtLlmArgs.validate_quant_config">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.validate_quant_config">[docs]</a>
<span class="nd">@field_validator</span><span class="p">(</span><span class="s2">&quot;quant_config&quot;</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;before&#39;</span><span class="p">)</span>
<span class="nd">@classmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_quant_config</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">v</span><span class="p">,</span> <span class="n">info</span><span class="p">):</span>
<span class="k">if</span> <span class="n">v</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">v</span> <span class="o">=</span> <span class="n">QuantConfig</span><span class="p">()</span>
<span class="k">return</span> <span class="n">v</span></div>
<div class="viewcode-block" id="TrtLlmArgs.setup_embedding_parallel_mode">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.setup_embedding_parallel_mode">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
@ -2333,6 +2393,14 @@
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Invalid build_cache_config: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span></div>
<div class="viewcode-block" id="TrtLlmArgs.validate_kv_cache_dtype">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.validate_kv_cache_dtype">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_kv_cache_dtype</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">dtype</span> <span class="o">==</span> <span class="s2">&quot;auto&quot;</span><span class="p">,</span> <span class="s2">&quot;KvCacheConfig.dtype is not supported by the TensorRT backend.&quot;</span>
<span class="k">return</span> <span class="bp">self</span></div>
</div>
@ -2385,7 +2453,7 @@
<span class="s2">&quot;Lower values trigger more frequent garbage collection.&quot;</span><span class="p">)</span>
<span class="n">cuda_graph_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">CudaGraphConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">default_factory</span><span class="o">=</span><span class="n">CudaGraphConfig</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;CUDA graph config.If true, use CUDA graphs for decoding. </span><span class="se">\</span>
<span class="s2"> CUDA graphs are only created for the batch sizes in cuda_graph_config.batch_sizes, </span><span class="se">\</span>
<span class="s2"> and are enabled for batches that consist of decoding requests *only* </span><span class="se">\</span>
@ -2396,26 +2464,12 @@
<span class="n">disable_overlap_scheduler</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Disable the overlap scheduler.&quot;</span><span class="p">)</span>
<span class="n">moe_max_num_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span>
<span class="s2">&quot;If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used.&quot;</span>
<span class="p">)</span>
<span class="n">moe_load_balancer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">object</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Configuration for MoE load balancing.&quot;</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;type&quot;</span><span class="p">:</span>
<span class="s2">&quot;Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]&quot;</span>
<span class="p">})</span>
<span class="n">moe_config</span><span class="p">:</span> <span class="n">MoeConfig</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="n">MoeConfig</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;MoE config.&quot;</span><span class="p">)</span>
<span class="n">attn_backend</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s1">&#39;TRTLLM&#39;</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Attention backend to use.&quot;</span><span class="p">)</span>
<span class="n">moe_backend</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s1">&#39;CUTLASS&#39;</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;MoE backend to use.&quot;</span><span class="p">)</span>
<span class="n">enable_mixed_sampler</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span>
@ -2428,9 +2482,6 @@
<span class="s2">&quot;If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies.&quot;</span>
<span class="p">)</span>
<span class="n">kv_cache_dtype</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s2">&quot;auto&quot;</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Data type for KV cache.&quot;</span><span class="p">)</span>
<span class="n">enable_iter_perf_stats</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable iteration performance statistics.&quot;</span><span class="p">)</span>
@ -2483,6 +2534,31 @@
<span class="s1">&#39;LOWPRECISION&#39;</span><span class="p">,</span>
<span class="s1">&#39;MNNVL&#39;</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s1">&#39;AUTO&#39;</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;Allreduce strategy to use.&quot;</span><span class="p">)</span>
<span class="n">checkpoint_loader</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;The checkpoint loader to use for this LLM instance.&quot;</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="s2">&quot;Optional[tensorrt_llm._torch.BaseCheckpointLoader]&quot;</span>
<span class="p">},</span>
<span class="p">)</span>
<span class="n">checkpoint_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">&quot;The format of the provided checkpoint.&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># PrivateVars</span>
<span class="n">_quant_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">QuantConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">PrivateAttr</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">quant_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">QuantConfig</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_quant_config</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_quant_config</span> <span class="o">=</span> <span class="n">QuantConfig</span><span class="p">()</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_quant_config</span>
<span class="nd">@quant_config</span><span class="o">.</span><span class="n">setter</span>
<span class="k">def</span><span class="w"> </span><span class="nf">quant_config</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">QuantConfig</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_quant_config</span> <span class="o">=</span> <span class="n">value</span>
<span class="c1"># TODO: remove backend later</span>
<div class="viewcode-block" id="TorchLlmArgs.init_backend">
@ -2523,28 +2599,6 @@
<span class="k">def</span><span class="w"> </span><span class="nf">extra_resource_managers</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">object</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_extra_resource_managers</span> <span class="o">=</span> <span class="n">value</span>
<div class="viewcode-block" id="TorchLlmArgs.validate_moe_load_balancer">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_moe_load_balancer">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_moe_load_balancer</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.._torch.model_config</span><span class="w"> </span><span class="kn">import</span> <span class="n">MoeLoadBalancerConfig</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">FileNotFoundError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;MoE load balancer config file not found: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
<span class="n">moe_load_balancer_config</span> <span class="o">=</span> <span class="n">yaml</span><span class="o">.</span><span class="n">safe_load</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span> <span class="o">=</span> <span class="n">MoeLoadBalancerConfig</span><span class="p">(</span>
<span class="o">**</span><span class="n">moe_load_balancer_config</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Failed to load MoE load balancer config file: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span> <span class="kn">from</span><span class="w"> </span><span class="nn">e</span>
<span class="k">return</span> <span class="bp">self</span></div>
<div class="viewcode-block" id="TorchLlmArgs.validate_stream_interval">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_stream_interval">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
@ -2555,19 +2609,38 @@
<span class="k">return</span> <span class="bp">self</span></div>
<div class="viewcode-block" id="TorchLlmArgs.validate_checkpoint_format">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_checkpoint_format">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_checkpoint_format</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">checkpoint_format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">checkpoint_loader</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span>
<span class="s2">&quot;checkpoint_format and checkpoint_loader are both provided, &quot;</span>
<span class="s2">&quot;checkpoint_loader will be ignored.&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">checkpoint_loader</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">checkpoint_format</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">checkpoint_loader</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
<span class="s2">&quot;neither checkpoint_format nor checkpoint_loader were provided, &quot;</span>
<span class="s2">&quot;checkpoint_format will be set to HF.&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">checkpoint_format</span> <span class="o">=</span> <span class="s2">&quot;HF&quot;</span>
<span class="k">return</span> <span class="bp">self</span></div>
<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_generate_cuda_graph_batch_sizes</span><span class="p">(</span><span class="n">max_batch_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">padding_enabled</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="n">enable_padding</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Generate a list of batch sizes for CUDA graphs.</span>
<span class="sd"> Args:</span>
<span class="sd"> max_batch_size: Maximum batch size to generate up to</span>
<span class="sd"> padding_enabled: Whether padding is enabled, which affects the batch size distribution</span>
<span class="sd"> enable_padding: Whether padding is enabled, which affects the batch size distribution</span>
<span class="sd"> Returns:</span>
<span class="sd"> List of batch sizes to create CUDA graphs for</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">padding_enabled</span><span class="p">:</span>
<span class="k">if</span> <span class="n">enable_padding</span><span class="p">:</span>
<span class="n">batch_sizes</span> <span class="o">=</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">4</span><span class="p">]</span> <span class="o">+</span> <span class="p">[</span><span class="n">i</span> <span class="o">*</span> <span class="mi">8</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">17</span><span class="p">)]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">batch_sizes</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">32</span><span class="p">))</span> <span class="o">+</span> <span class="p">[</span><span class="mi">32</span><span class="p">,</span> <span class="mi">64</span><span class="p">,</span> <span class="mi">128</span><span class="p">]</span>
@ -2587,6 +2660,28 @@
<span class="k">return</span> <span class="n">batch_sizes</span>
<div class="viewcode-block" id="TorchLlmArgs.validate_load_balancer">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_load_balancer">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s2">&quot;after&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_load_balancer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s1">&#39;TorchLlmArgs&#39;</span><span class="p">:</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.._torch</span><span class="w"> </span><span class="kn">import</span> <span class="n">MoeLoadBalancerConfig</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_config</span><span class="o">.</span><span class="n">load_balancer</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_config</span><span class="o">.</span><span class="n">load_balancer</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">FileNotFoundError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;MoE load balancer config file not found: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_config</span><span class="o">.</span><span class="n">load_balancer</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_config</span><span class="o">.</span><span class="n">load_balancer</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
<span class="n">moe_load_balancer_config</span> <span class="o">=</span> <span class="n">yaml</span><span class="o">.</span><span class="n">safe_load</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">moe_config</span><span class="o">.</span><span class="n">load_balancer</span> <span class="o">=</span> <span class="n">MoeLoadBalancerConfig</span><span class="p">(</span>
<span class="o">**</span><span class="n">moe_load_balancer_config</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Failed to load MoE load balancer config file: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">load_balancer</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span> <span class="kn">from</span><span class="w"> </span><span class="nn">e</span>
<span class="k">return</span> <span class="bp">self</span></div>
<div class="viewcode-block" id="TorchLlmArgs.validate_cuda_graph_config">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_config">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s1">&#39;after&#39;</span><span class="p">)</span>
@ -2607,7 +2702,7 @@
<span class="n">config</span><span class="o">.</span><span class="n">batch_sizes</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">config</span><span class="o">.</span><span class="n">batch_sizes</span><span class="p">)</span>
<span class="k">if</span> <span class="n">config</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">if</span> <span class="n">config</span><span class="o">.</span><span class="n">batch_sizes</span> <span class="o">!=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_generate_cuda_graph_batch_sizes</span><span class="p">(</span>
<span class="n">config</span><span class="o">.</span><span class="n">max_batch_size</span><span class="p">,</span> <span class="n">config</span><span class="o">.</span><span class="n">padding_enabled</span><span class="p">):</span>
<span class="n">config</span><span class="o">.</span><span class="n">max_batch_size</span><span class="p">,</span> <span class="n">config</span><span class="o">.</span><span class="n">enable_padding</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Please don&#39;t set both cuda_graph_config.batch_sizes &quot;</span>
<span class="s2">&quot;and cuda_graph_config.max_batch_size.</span><span class="se">\n</span><span class="s2">&quot;</span>
@ -2619,13 +2714,32 @@
<span class="k">else</span><span class="p">:</span>
<span class="n">max_batch_size</span> <span class="o">=</span> <span class="n">config</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="ow">or</span> <span class="mi">128</span>
<span class="n">generated_sizes</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_generate_cuda_graph_batch_sizes</span><span class="p">(</span>
<span class="n">max_batch_size</span><span class="p">,</span> <span class="n">config</span><span class="o">.</span><span class="n">padding_enabled</span><span class="p">)</span>
<span class="n">max_batch_size</span><span class="p">,</span> <span class="n">config</span><span class="o">.</span><span class="n">enable_padding</span><span class="p">)</span>
<span class="n">config</span><span class="o">.</span><span class="n">batch_sizes</span> <span class="o">=</span> <span class="n">generated_sizes</span>
<span class="n">config</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="o">=</span> <span class="n">max_batch_size</span>
<span class="k">return</span> <span class="bp">self</span></div>
<div class="viewcode-block" id="TorchLlmArgs.sync_quant_config_with_kv_cache_config_dtype">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.sync_quant_config_with_kv_cache_config_dtype">[docs]</a>
<span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s1">&#39;after&#39;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">sync_quant_config_with_kv_cache_config_dtype</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s1">&#39;TorchLlmArgs&#39;</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">dtype</span> <span class="o">==</span> <span class="s2">&quot;auto&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">dtype</span> <span class="o">==</span> <span class="s1">&#39;fp8&#39;</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">kv_cache_quant_algo</span> <span class="o">=</span> <span class="n">QuantAlgo</span><span class="o">.</span><span class="n">FP8</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Cannot sync quant_config.kv_cache_quant_algo with kv_cache_config.dtype of </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">dtype</span><span class="si">}</span><span class="s2">, &quot;</span>
<span class="s2">&quot;please update the validator&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span></div>
<span class="c1"># TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig</span>
<div class="viewcode-block" id="TorchLlmArgs.get_pytorch_backend_config">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.get_pytorch_backend_config">[docs]</a>
@ -2641,17 +2755,17 @@
<span class="n">cuda_graph_max_batch_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_config</span><span class="o">.</span><span class="n">max_batch_size</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_config</span> <span class="k">else</span>
<span class="n">CudaGraphConfig</span><span class="o">.</span><span class="n">model_fields</span><span class="p">[</span><span class="s1">&#39;max_batch_size&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">default</span><span class="p">,</span>
<span class="n">cuda_graph_padding_enabled</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_config</span><span class="o">.</span><span class="n">padding_enabled</span>
<span class="n">cuda_graph_padding_enabled</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_config</span><span class="o">.</span><span class="n">enable_padding</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_config</span> <span class="k">else</span>
<span class="n">CudaGraphConfig</span><span class="o">.</span><span class="n">model_fields</span><span class="p">[</span><span class="s1">&#39;padding_enabled&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">default</span><span class="p">,</span>
<span class="n">CudaGraphConfig</span><span class="o">.</span><span class="n">model_fields</span><span class="p">[</span><span class="s1">&#39;enable_padding&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">default</span><span class="p">,</span>
<span class="n">disable_overlap_scheduler</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">disable_overlap_scheduler</span><span class="p">,</span>
<span class="n">moe_max_num_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_max_num_tokens</span><span class="p">,</span>
<span class="n">moe_load_balancer</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="p">,</span>
<span class="n">moe_max_num_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_config</span><span class="o">.</span><span class="n">max_num_tokens</span><span class="p">,</span>
<span class="n">moe_load_balancer</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_config</span><span class="o">.</span><span class="n">load_balancer</span><span class="p">,</span>
<span class="n">attn_backend</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">attn_backend</span><span class="p">,</span>
<span class="n">moe_backend</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_backend</span><span class="p">,</span>
<span class="n">moe_backend</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_config</span><span class="o">.</span><span class="n">backend</span><span class="p">,</span>
<span class="n">enable_mixed_sampler</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_mixed_sampler</span><span class="p">,</span>
<span class="n">enable_trtllm_sampler</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_trtllm_sampler</span><span class="p">,</span>
<span class="n">kv_cache_dtype</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_dtype</span><span class="p">,</span>
<span class="n">kv_cache_dtype</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">enable_iter_perf_stats</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_iter_perf_stats</span><span class="p">,</span>
<span class="n">enable_iter_req_stats</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_iter_req_stats</span><span class="p">,</span>
<span class="n">print_iter_log</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">print_iter_log</span><span class="p">,</span>
@ -2693,10 +2807,12 @@
<span class="s2">&quot;enable_build_cache&quot;</span><span class="p">:</span> <span class="n">BuildCacheConfig</span><span class="p">,</span>
<span class="s2">&quot;speculative_config&quot;</span><span class="p">:</span> <span class="n">DecodingBaseConfig</span><span class="p">,</span>
<span class="s2">&quot;lora_config&quot;</span><span class="p">:</span> <span class="n">LoraConfig</span><span class="p">,</span>
<span class="s2">&quot;moe_config&quot;</span><span class="p">:</span> <span class="n">MoeConfig</span><span class="p">,</span>
<span class="p">}</span>
<span class="k">for</span> <span class="n">field_name</span><span class="p">,</span> <span class="n">field_type</span> <span class="ow">in</span> <span class="n">field_mapping</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="k">if</span> <span class="n">field_name</span> <span class="ow">in</span> <span class="n">llm_args_dict</span><span class="p">:</span>
<span class="k">if</span> <span class="n">field_name</span> <span class="o">==</span> <span class="s2">&quot;speculative_config&quot;</span><span class="p">:</span>
<span class="c1"># Some fields need to be converted manually.</span>
<span class="k">if</span> <span class="n">field_name</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;speculative_config&quot;</span><span class="p">,</span> <span class="s2">&quot;build_config&quot;</span><span class="p">]:</span>
<span class="n">llm_args_dict</span><span class="p">[</span><span class="n">field_name</span><span class="p">]</span> <span class="o">=</span> <span class="n">field_type</span><span class="o">.</span><span class="n">from_dict</span><span class="p">(</span>
<span class="n">llm_args_dict</span><span class="p">[</span><span class="n">field_name</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
@ -2719,7 +2835,8 @@
<span class="k">return</span> <span class="n">llm_args</span>
<span class="k">def</span><span class="w"> </span><span class="nf">get_model_format</span><span class="p">(</span><span class="n">model_dir</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">_ModelFormatKind</span><span class="p">:</span>
<span class="k">def</span><span class="w"> </span><span class="nf">get_model_format</span><span class="p">(</span><span class="n">model_dir</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">trust_remote_code</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">_ModelFormatKind</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&#39;&#39;&#39; Get the format of the model. &#39;&#39;&#39;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="n">Path</span><span class="p">(</span><span class="n">model_dir</span><span class="p">)</span> <span class="o">/</span> <span class="s1">&#39;config.json&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">exists</span><span class="p">():</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
@ -2738,7 +2855,8 @@
<span class="n">PretrainedConfig</span><span class="o">.</span><span class="n">from_checkpoint</span><span class="p">(</span><span class="n">model_dir</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">model_format</span> <span class="o">=</span> <span class="n">_ModelFormatKind</span><span class="o">.</span><span class="n">HF</span>
<span class="n">AutoConfig</span><span class="o">.</span><span class="n">from_hugging_face</span><span class="p">(</span><span class="n">model_dir</span><span class="p">)</span>
<span class="n">AutoConfig</span><span class="o">.</span><span class="n">from_hugging_face</span><span class="p">(</span><span class="n">model_dir</span><span class="p">,</span>
<span class="n">trust_remote_code</span><span class="o">=</span><span class="n">trust_remote_code</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Inferred model format </span><span class="si">{</span><span class="n">model_format</span><span class="si">}</span><span class="s2">, but failed to load config.json: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s2">&quot;</span>
@ -2865,9 +2983,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1133,9 +1137,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -865,9 +869,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1169,9 +1173,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -777,9 +781,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -794,9 +798,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -993,9 +997,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -822,9 +826,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -653,9 +657,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -906,9 +910,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -804,9 +808,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -668,9 +672,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -794,9 +798,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -888,9 +892,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -970,9 +974,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1006,9 +1010,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1942,9 +1946,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -2847,9 +2851,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -729,9 +733,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -891,9 +895,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -819,9 +823,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1011,9 +1015,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -938,9 +942,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1041,9 +1045,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -667,9 +671,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -819,9 +823,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -759,9 +763,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -893,9 +897,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1241,9 +1245,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1086,9 +1090,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -726,9 +730,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -876,9 +880,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -2187,9 +2191,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1253,9 +1257,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -2651,9 +2655,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -791,9 +795,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -725,9 +729,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -793,9 +797,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -796,9 +800,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -840,9 +844,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -936,9 +940,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1239,9 +1243,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -926,9 +930,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1467,9 +1471,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1035,9 +1039,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1882,9 +1886,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1150,9 +1154,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -5438,9 +5442,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1097,9 +1101,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1609,9 +1613,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -1819,9 +1823,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -57,7 +57,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -67,7 +67,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -342,6 +342,8 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -377,6 +379,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -3217,7 +3221,7 @@
            )
            image = None
        elif self.model_type in ['llava_onevision']:
            pre_prompt = "<|im_start|>user "
            pre_prompt = "<|im_start|>user " + "<video>" if self.args.video_path is not None else "<image>"
            if input_text is None:
                input_text = "Question: which city is this? Answer:" if self.args.video_path is None else "Why is this video funny?"
            post_prompt = f"\n{input_text}<|im_end|><|im_start|>assistant\n"
@ -3228,7 +3232,7 @@
                text=prompt,
                return_tensors="pt")
        else:
            image = self.processor(videos=raw_image,
            image = self.processor(videos=list(raw_image),
                text=prompt,
                return_tensors="pt")
View File

@ -4,12 +4,30 @@ Executor
.. Here are files in the cpp/include/executor
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
cacheCommunicator.h
___________________
.. doxygenfile:: cacheCommunicator.h
:project: TensorRT-LLM
serialization.h
_______________
.. doxygenfile:: serialization.h
:project: TensorRT-LLM
disaggServerUtil.h
__________________
.. doxygenfile:: disaggServerUtil.h
:project: TensorRT-LLM
dataTransceiverState.h
______________________
.. doxygenfile:: dataTransceiverState.h
:project: TensorRT-LLM
tensor.h
________
@ -22,10 +40,10 @@ _______________
.. doxygenfile:: transferAgent.h
:project: TensorRT-LLM
serialization.h
_______________
executor.h
__________
.. doxygenfile:: serialization.h
.. doxygenfile:: executor.h
:project: TensorRT-LLM
types.h
@ -34,21 +52,3 @@ _______
.. doxygenfile:: types.h
:project: TensorRT-LLM
executor.h
__________
.. doxygenfile:: executor.h
:project: TensorRT-LLM
dataTransceiverState.h
______________________
.. doxygenfile:: dataTransceiverState.h
:project: TensorRT-LLM
cacheCommunicator.h
___________________
.. doxygenfile:: cacheCommunicator.h
:project: TensorRT-LLM

View File

@ -4,58 +4,22 @@ Runtime
.. Here are files in the cpp/include/runtime
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
lookaheadBuffers.h
__________________
.. doxygenfile:: lookaheadBuffers.h
:project: TensorRT-LLM
lookaheadModule.h
_________________
.. doxygenfile:: lookaheadModule.h
:project: TensorRT-LLM
iBuffer.h
decoderState.h
______________
.. doxygenfile:: decoderState.h
:project: TensorRT-LLM
request.h
_________
.. doxygenfile:: iBuffer.h
:project: TensorRT-LLM
modelConfig.h
_____________
.. doxygenfile:: modelConfig.h
:project: TensorRT-LLM
decodingOutput.h
________________
.. doxygenfile:: decodingOutput.h
:project: TensorRT-LLM
promptTuningParams.h
____________________
.. doxygenfile:: promptTuningParams.h
:project: TensorRT-LLM
bufferManager.h
_______________
.. doxygenfile:: bufferManager.h
:project: TensorRT-LLM
gptJsonConfig.h
_______________
.. doxygenfile:: gptJsonConfig.h
:project: TensorRT-LLM
runtimeDefaults.h
_________________
.. doxygenfile:: runtimeDefaults.h
.. doxygenfile:: request.h
:project: TensorRT-LLM
loraCache.h
@ -64,10 +28,34 @@ ___________
.. doxygenfile:: loraCache.h
:project: TensorRT-LLM
rawEngine.h
___________
bufferManager.h
_______________
.. doxygenfile:: rawEngine.h
.. doxygenfile:: bufferManager.h
:project: TensorRT-LLM
memoryCounters.h
________________
.. doxygenfile:: memoryCounters.h
:project: TensorRT-LLM
runtimeDefaults.h
_________________
.. doxygenfile:: runtimeDefaults.h
:project: TensorRT-LLM
ipcUtils.h
__________
.. doxygenfile:: ipcUtils.h
:project: TensorRT-LLM
tllmLogger.h
____________
.. doxygenfile:: tllmLogger.h
:project: TensorRT-LLM
gptDecoder.h
@ -76,34 +64,16 @@ ____________
.. doxygenfile:: gptDecoder.h
:project: TensorRT-LLM
eagleBuffers.h
______________
cudaEvent.h
___________
.. doxygenfile:: eagleBuffers.h
.. doxygenfile:: cudaEvent.h
:project: TensorRT-LLM
medusaModule.h
______________
modelConfig.h
_____________
.. doxygenfile:: medusaModule.h
:project: TensorRT-LLM
explicitDraftTokensBuffers.h
____________________________
.. doxygenfile:: explicitDraftTokensBuffers.h
:project: TensorRT-LLM
iTensor.h
_________
.. doxygenfile:: iTensor.h
:project: TensorRT-LLM
common.h
________
.. doxygenfile:: common.h
.. doxygenfile:: modelConfig.h
:project: TensorRT-LLM
loraCachePageManagerConfig.h
@ -118,22 +88,10 @@ _____________
.. doxygenfile:: worldConfig.h
:project: TensorRT-LLM
loraModule.h
____________
eagleModule.h
_____________
.. doxygenfile:: loraModule.h
:project: TensorRT-LLM
speculativeDecodingMode.h
_________________________
.. doxygenfile:: speculativeDecodingMode.h
:project: TensorRT-LLM
cudaEvent.h
___________
.. doxygenfile:: cudaEvent.h
.. doxygenfile:: eagleModule.h
:project: TensorRT-LLM
decodingInput.h
@ -142,40 +100,10 @@ _______________
.. doxygenfile:: decodingInput.h
:project: TensorRT-LLM
speculativeDecodingModule.h
___________________________
gptJsonConfig.h
_______________
.. doxygenfile:: speculativeDecodingModule.h
:project: TensorRT-LLM
iGptDecoderBatched.h
____________________
.. doxygenfile:: iGptDecoderBatched.h
:project: TensorRT-LLM
eagleModule.h
_____________
.. doxygenfile:: eagleModule.h
:project: TensorRT-LLM
tllmLogger.h
____________
.. doxygenfile:: tllmLogger.h
:project: TensorRT-LLM
gptDecoderBatched.h
___________________
.. doxygenfile:: gptDecoderBatched.h
:project: TensorRT-LLM
cudaStream.h
____________
.. doxygenfile:: cudaStream.h
.. doxygenfile:: gptJsonConfig.h
:project: TensorRT-LLM
ipcNvlsMemory.h
@ -190,27 +118,99 @@ ________________
.. doxygenfile:: samplingConfig.h
:project: TensorRT-LLM
request.h
_________
gptDecoderBatched.h
___________________
.. doxygenfile:: request.h
.. doxygenfile:: gptDecoderBatched.h
:project: TensorRT-LLM
decoderState.h
lookaheadBuffers.h
__________________
.. doxygenfile:: lookaheadBuffers.h
:project: TensorRT-LLM
loraModule.h
____________
.. doxygenfile:: loraModule.h
:project: TensorRT-LLM
promptTuningParams.h
____________________
.. doxygenfile:: promptTuningParams.h
:project: TensorRT-LLM
speculativeDecodingMode.h
_________________________
.. doxygenfile:: speculativeDecodingMode.h
:project: TensorRT-LLM
common.h
________
.. doxygenfile:: common.h
:project: TensorRT-LLM
medusaModule.h
______________
.. doxygenfile:: decoderState.h
.. doxygenfile:: medusaModule.h
:project: TensorRT-LLM
ipcUtils.h
__________
.. doxygenfile:: ipcUtils.h
:project: TensorRT-LLM
memoryCounters.h
decodingOutput.h
________________
.. doxygenfile:: memoryCounters.h
.. doxygenfile:: decodingOutput.h
:project: TensorRT-LLM
cudaStream.h
____________
.. doxygenfile:: cudaStream.h
:project: TensorRT-LLM
eagleBuffers.h
______________
.. doxygenfile:: eagleBuffers.h
:project: TensorRT-LLM
iGptDecoderBatched.h
____________________
.. doxygenfile:: iGptDecoderBatched.h
:project: TensorRT-LLM
speculativeDecodingModule.h
___________________________
.. doxygenfile:: speculativeDecodingModule.h
:project: TensorRT-LLM
explicitDraftTokensBuffers.h
____________________________
.. doxygenfile:: explicitDraftTokensBuffers.h
:project: TensorRT-LLM
rawEngine.h
___________
.. doxygenfile:: rawEngine.h
:project: TensorRT-LLM
iTensor.h
_________
.. doxygenfile:: iTensor.h
:project: TensorRT-LLM
iBuffer.h
_________
.. doxygenfile:: iBuffer.h
:project: TensorRT-LLM

View File

@ -16,8 +16,6 @@ An [architectural and performance overview](../../../docs/source/blogs/tech_blog
TRT-LLM uses some environment variables to control the behavior of the disaggregated service.
* `TRTLLM_USE_UCX_KVCACHE`: Specifies whether to use UCX for KV cache transfer. The default value is `0`. This must be enabled when using a disaggregated service.
* `TRTLLM_PARALLEL_CACHE_SEND`: If set to `1`, contextExecutor will attempt to send KV cache for multiple requests in parallel. The default value is `0`.
* `TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP`: If set to `1`, generationExecutor will not overlap KV cache transfer with model inference. The default value is `0`.
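For example, enabling parallel KV cache sending for the context executor is a one-line opt-in (a sketch; the other variables keep their defaults):
```
export TRTLLM_PARALLEL_CACHE_SEND=1
```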
@ -66,55 +64,19 @@ A. Yes, it's recommended that different executors use different GPUs. We support
*Q. How to handle error `Disaggregated serving is not enabled, please check the configuration?`*
A. Please set the environment variables
```
export TRTLLM_USE_UCX_KVCACHE=1
A. Please set the `backendType` of `CacheTransceiverConfig`.
```cpp
ExecutorConfig executorConfig{...};
executorConfig.setCacheTransceiverConfig(texec::CacheTransceiverConfig(BackendType::DEFAULT));
```
*Q. Why do some profiling tools show that TRT-LLM's KV cache transfer does not utilize NVLink even on devices equipped with NVLink?*
A. Please check the version of `UCX` with `ucx_info -v`.
If the UCX version is <=1.17, set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda` and `UCX_MEMTYPE_CACHE=n` to enable NVLink. For Blackwell-architecture GPUs, UCX version >=1.19 is required to enable NVLink.
If the UCX version is >=1.18, there are several ways to enable NVLink:
1. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B`, `UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda`, `UCX_CUDA_COPY_DMABUF=no`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`, where `$Size` is the size of the buffer for KV cache transfer; it is recommended to be larger than the KV cache of the longest request (see the sketch below).
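A minimal sketch of option 2, assuming a 4GB transfer buffer (the size is an illustrative assumption; choose a value larger than the KV cache of your longest request):
```
# Illustrative values only; tune the buffer size for your workload.
export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=4GB
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
```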
When the environment variable `TRTLLM_USE_MPI_KVCACHE=1` is set, TRT-LLM will transfer the KV cache using `CUDA-aware MPI`. All executor processes involved must share the same MPI world communicator. Consequently, with `TRTLLM_USE_MPI_KVCACHE=1`, TRT-LLM only supports launching multiple executors via `MPI`. Additionally, the `CommunicationMode` for the executors must be set to `kLEADER` or `kORCHESTRATOR` with `SpawnProcesses=false` for the `disaggregated-service`. These restrictions do not apply when `TRTLLM_USE_UCX_KVCACHE=1` is set.
*Q. Does TRT-LLM support using GPU direct RDMA for inter-node KV Cache transfer?*
A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer, but it is not enabled by default. There are several ways to enable GPU direct RDMA:
1. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B`, `UCX_RNDV_FRAG_MEM_TYPE=cuda`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`, where `$Size` is the size of the buffer for KV cache transfer; it is recommended to be larger than the KV cache of the longest request.
A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer.
*Q. Are there any guidelines for performance tuning of KV cache transfer?*
*Q. What causes the substantial bandwidth fluctuations in kvCache transfers, especially during the first few requests following service initialization?*
A. Depending on the user's use case, certain sets of environment variables can help avoid poor KV cache transfer performance.
Environment Variable Set A
```
export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
export UCX_RNDV_FRAG_MEM_TYPES=cuda
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
```
This set allows KV cache transfers to utilize NVLink within nodes and GDRDMA between nodes.
Environment Variable Set B
```
export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
export UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
export UCX_CUDA_COPY_DMABUF=no
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
```
Set B may provide slightly better performance on a single node compared to Set A. However, when transferring KV cache across multiple nodes, it may cause program instability.
Environment Variable Set C
```
export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
```
Set C can achieve better performance than Sets A and B, both within and between nodes. However, if the KV cache size exceeds the specified $Size, performance may degrade.
A. The communication channels for kvCache transfer between executors are established dynamically. Connection establishment incurs significant overhead, which explains the lower kvCache transfer bandwidth observed during the first requests after service startup, since that measurement includes the establishment cost. When conducting benchmarks, it is recommended to run a warm-up phase to ensure accurate performance measurements.

View File

@ -3,13 +3,14 @@
- [About Speculative Sampling](#about-speculative-sampling)
- [Performance Improvements](#Performance-improvements)
- [Draft-Target-Model](#Draft-Target-Model)
- [Prompt-Lookup-Decoding](#prompt-lookup-decoding)
- [NGram](#ngram)
- [Medusa](#medusa)
- [Medusa Tree](#medusa-tree)
- [Using Medusa with TensorRT-LLM](#using-medusa-with-tensorrt-llm)
- [Limitations](#limitations)
- [ReDrafter](#redrafter)
- [EAGLE](#eagle)
- [Disaggregated Serving](#disaggregated-serving)
- [Lookahead decoding](#lookahead-decoding)
## About Speculative Sampling
@ -35,7 +36,7 @@ TensorRT-LLM supports several approaches for generating draft tokens, including:
1. [Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads paper](https://arxiv.org/abs/2401.10774).
2. [Recurrent Drafter for Fast Speculative Decoding in Large Language Models](https://arxiv.org/html/2403.09919v1).
3. [EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty](https://arxiv.org/pdf/2401.15077).
3. Utilizing prompt tokens as draft tokens. For more information, refer to [Prompt Lookup Decoding](https://github.com/apoorvumang/prompt-lookup-decoding/).
3. Utilizing prompt tokens as draft tokens. For more information, refer to [NGram](https://github.com/apoorvumang/prompt-lookup-decoding/).
4. Utilizing Jacobi-like decoding to predict and verify draft tokens using the same model which does not need additional fine-tuning. Refer to [Break the Sequential Dependency of LLM Inference Using Lookahead Decoding](https://arxiv.org/pdf/2402.02057).
@ -61,13 +62,13 @@ Subsequently, the prompt, now updated with the accepted tokens, is sent back to
This iterative process continues until predefined stop conditions are met.
An example of this orchestration process can be found in the [TensorRT-LLM Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py).
We provide two styles of running Draft-Target-Model now: using TensorRT-LLM-BLS in Triton Inference Server, or using TensorRT-LLM directly. Detailed steps of running can be found in [examples/draft_target_model/README.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/draft_target_model/README.md) and the code can be found in [examples/prompt_lookup/run_dtm_pld.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/run_dtm_pld.py).
We currently provide two ways to run Draft-Target-Model: using TensorRT-LLM-BLS in the Triton Inference Server, or using TensorRT-LLM directly. Detailed steps can be found in [examples/draft_target_model/README.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/draft_target_model/README.md) and the code can be found in [examples/ngram/run_dtm_ngram.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ngram/run_dtm_ngram.py).
## Prompt-Lookup-Decoding
## NGram
The Prompt-Lookup speculative decoding directly copies from the input prompt and previous generated output as draft tokens while generating the later output. It works like Draft-Target-Model but involves only one Target LLM model without further fine-tuning. The Prompt-Lookup profit from the scenarios which have high n-gram overlap between input prompt and output, such as summarization, document QA, multi-turn chat, code editing, etc.
NGram speculative decoding directly copies segments of the input prompt and of previously generated output as draft tokens while generating the later output. It works like Draft-Target-Model but involves only one target LLM, without further fine-tuning. NGram profits in scenarios with high n-gram overlap between the input prompt and the output, such as summarization, document QA, multi-turn chat, and code editing.
See document in [examples/prompt_lookup/README.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/README.md) and the code can be found in [examples/prompt_lookup/run_dtm_pld.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/prompt_lookup/run_dtm_pld.py).
See the documentation in [examples/ngram/README.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ngram/README.md); the code can be found in [examples/ngram/run_dtm_ngram.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ngram/run_dtm_ngram.py).
## Medusa
@ -169,6 +170,10 @@ The EAGLE approach enhances the single-model Medusa method by predicting and ver
Similarly to ReDrafter, TensorRT-LLM implements the EAGLE model such that logits prediction, draft token acceptance and draft token generation are performed inside the TensorRT engine. EAGLE-1 and EAGLE-2 are both supported, while EAGLE-2 is currently in the experimental stage. Please visit the [EAGLE README](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/eagle/README.md) for information about building and running the model.
### Disaggregated Serving
[Disaggregated Serving](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md) with EAGLE3 using the two-model approach is supported in the PyTorch backend. Refer to the [Dynamo example](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/llama4_plus_eagle.md) for how to run EAGLE3 with Disaggregated Serving for Llama 4 Maverick.
## Lookahead Decoding
The lookahead decoding algorithm operates through two parallel computation branches within the same model: a lookahead branch that generates n-grams using a fixed-size 2D window, and a verification branch that validates promising n-gram candidates. This approach eliminates the need for additional model training or fine-tuning and can be enabled for any autoregressive model. Refer to the [Lookahead decoding README](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/lookahead/README.md) for information about building and running the model.

View File

@ -138,7 +138,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
cat >./extra-llm-api-config.yml<<EOF
cuda_graph_config: {}
moe_backend: TRTLLM
moe_config:
  backend: TRTLLM
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
@ -196,7 +197,7 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers
```bash
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
  padding_enabled: true
  enable_padding: true
  batch_sizes:
  - 896
  - 512
@ -263,7 +264,7 @@ YOUR_DATA_PATH=./dataset.txt
cat >./extra-llm-api-config.yml <<EOF
cuda_graph_config:
  padding_enabled: true
  enable_padding: true
  batch_sizes:
  - 1
  - 2

View File

@ -124,7 +124,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
cat >./extra-llm-api-config.yml<<EOF
cuda_graph_config: {}
moe_backend: TRTLLM
moe_config:
  backend: TRTLLM
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
@ -179,7 +180,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
cat >./extra-llm-api-config.yml<<EOF
cuda_graph_config: {}
moe_backend: TRTLLM
moe_config:
  backend: TRTLLM
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3

View File

@ -157,7 +157,7 @@ These optimizations target the overall execution flow, scheduling, and resource
TensorRT-LLM provides a CUDA Graph padding feature, a good trade-off between the number of captured CUDA Graphs and the CUDA Graph hit ratio: it pads a batch up to the nearest size that has a captured graph. You should normally enable CUDA Graph padding to increase the hit rate, though the padding itself incurs some overhead from the wasted token computation.
Users can opt out of the CUDA Graph padding feature to measure the performance difference by setting `cuda_graph_config:\n  padding_enabled: False`; see the API in [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
Users can opt out of the CUDA Graph padding feature to measure the performance difference by setting `cuda_graph_config:\n  enable_padding: False`; see the API in [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
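The same opt-out can be expressed through the LLM API directly. This is a minimal sketch assuming `CudaGraphConfig` is importable from `tensorrt_llm.llmapi` and that `LLM` accepts a matching `cuda_graph_config` keyword; the model name is a placeholder.

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig  # assumed import path

# enable_padding=False avoids wasted-token computation at the cost of a lower
# CUDA Graph hit rate; batches must then match a captured size exactly.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    cuda_graph_config=CudaGraphConfig(
        enable_padding=False,
        batch_sizes=[1, 2, 4, 8, 16],
    ),
)
```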
* Overlap Scheduler:

View File

@ -623,7 +623,8 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
cat > ./extra_llm_api_options_eplb.yaml <<EOF
enable_attention_dp: true
cuda_graph_config: {}
moe_load_balancer: ./moe_load_balancer.yaml
moe_config:
  load_balancer: ./moe_load_balancer.yaml
EOF
trtllm-llmapi-launch \
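The same YAML options can also be passed programmatically. This is a minimal sketch assuming the YAML keys above map one-to-one onto `LLM` keyword arguments and that `MoeConfig` is importable from `tensorrt_llm.llmapi`; the checkpoint name is a placeholder.

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import MoeConfig  # assumed import path

# load_balancer accepts either an inline config object or, as here, the path
# to a separate EPLB config file.
llm = LLM(
    model="deepseek-ai/DeepSeek-V3",  # placeholder MoE checkpoint
    enable_attention_dp=True,
    moe_config=MoeConfig(load_balancer="./moe_load_balancer.yaml"),
)
```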

View File

@ -0,0 +1,164 @@
trtllm-bench
===========================
trtllm-bench is a comprehensive benchmarking tool for TensorRT-LLM engines. It provides three main subcommands for different benchmarking scenarios: ``throughput``, ``latency``, and ``build``.
**Common Options for All Commands:**
**Usage:**
.. click:: tensorrt_llm.commands.bench:main
   :prog: trtllm-bench
   :nested: full
   :commands: throughput, latency, build
prepare_dataset.py
===========================
trtllm-bench is designed to work with the `prepare_dataset.py <https://github.com/NVIDIA/TensorRT-LLM/blob/main/benchmarks/cpp/prepare_dataset.py>`_ script, which generates benchmark datasets in the required format. The prepare_dataset script supports:
**Dataset Types:**
- Real datasets from various sources
- Synthetic datasets with normal or uniform token distributions
- LoRA task-specific datasets
**Key Features:**
- Tokenizer integration for proper text preprocessing
- Configurable random seeds for reproducible results
- Support for LoRA adapters and task IDs
- Output in JSON format compatible with trtllm-bench
.. important::
   The ``--stdout`` flag is **required** when using prepare_dataset.py with trtllm-bench to ensure proper data streaming format.
**Usage:**
prepare_dataset
-------------------
.. code-block:: bash
   python prepare_dataset.py [OPTIONS]
**Options**
----
.. list-table::
   :widths: 20 80
   :header-rows: 1

   * - Option
     - Description
   * - ``--tokenizer``
     - Tokenizer directory or HuggingFace model name (required)
   * - ``--output``
     - Output JSON filename (default: preprocessed_dataset.json)
   * - ``--stdout``
     - Print output to stdout with JSON dataset entry on each line (**required for trtllm-bench**)
   * - ``--random-seed``
     - Random seed for token generation (default: 420)
   * - ``--task-id``
     - LoRA task ID (default: -1)
   * - ``--rand-task-id``
     - Random LoRA task range (two integers)
   * - ``--lora-dir``
     - Directory containing LoRA adapters
   * - ``--log-level``
     - Logging level: info or debug (default: info)
dataset
-------------------
Process real datasets from various sources.
.. code-block:: bash
   python prepare_dataset.py dataset [OPTIONS]
**Options**
----
.. list-table::
   :widths: 20 80
   :header-rows: 1

   * - Option
     - Description
   * - ``--input``
     - Input dataset file or directory (required)
   * - ``--max-input-length``
     - Maximum input sequence length (default: 2048)
   * - ``--max-output-length``
     - Maximum output sequence length (default: 512)
   * - ``--num-samples``
     - Number of samples to process (default: all)
   * - ``--format``
     - Input format: json, jsonl, csv, or txt (default: auto-detect)
token_norm_dist
-------------------
Generate synthetic datasets with normal token distribution.
.. code-block:: bash
   python prepare_dataset.py token_norm_dist [OPTIONS]
**Options**
----
.. list-table::
   :widths: 20 80
   :header-rows: 1

   * - Option
     - Description
   * - ``--num-requests``
     - Number of requests to be generated (required)
   * - ``--input-mean``
     - Normal distribution mean for input tokens (required)
   * - ``--input-stdev``
     - Normal distribution standard deviation for input tokens (required)
   * - ``--output-mean``
     - Normal distribution mean for output tokens (required)
   * - ``--output-stdev``
     - Normal distribution standard deviation for output tokens (required)
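As a quick sketch, the synthetic-dataset flow above can also be driven from Python. The flags are exactly those documented in the tables; the tokenizer name and output path are placeholders.

.. code-block:: python

   # Sketch: generate a synthetic dataset for trtllm-bench by shelling out to
   # prepare_dataset.py. Only flags documented above are used.
   import subprocess

   cmd = [
       "python", "prepare_dataset.py",
       "--tokenizer", "meta-llama/Llama-3.1-8B-Instruct",
       "--stdout",  # required so trtllm-bench receives one JSON entry per line
       "token_norm_dist",
       "--num-requests", "100",
       "--input-mean", "128", "--input-stdev", "16",
       "--output-mean", "256", "--output-stdev", "32",
   ]
   with open("synthetic_dataset.jsonl", "w") as out_file:
       subprocess.run(cmd, stdout=out_file, check=True)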
token_unif_dist
-------------------
Generate synthetic datasets with uniform token distribution.
.. code-block:: bash
   python prepare_dataset.py token_unif_dist [OPTIONS]
**Options**
----
.. list-table::
   :widths: 20 80
   :header-rows: 1

   * - Option
     - Description
   * - ``--num-requests``
     - Number of requests to be generated (required)
   * - ``--input-min``
     - Uniform distribution minimum for input tokens (required)
   * - ``--input-max``
     - Uniform distribution maximum for input tokens (required)
   * - ``--output-min``
     - Uniform distribution minimum for output tokens (required)
   * - ``--output-max``
     - Uniform distribution maximum for output tokens (required)

View File

@ -22,6 +22,8 @@ _____________
llm_logits_processor
llm_multilora
llm_speculative_decoding
llm_runtime
llm_sampling
Slurm
_____

View File

@ -3,6 +3,6 @@ Generate text with guided decoding
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_guided_decoding.py.
.. literalinclude:: ../../../examples/llm-api/llm_guided_decoding.py
   :lines: 4-50
   :lines: 4-47
   :language: python
   :linenos:

View File

@ -0,0 +1,8 @@
Runtime Configuration Examples
==============================
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_runtime.py.
.. literalinclude:: ../../../examples/llm-api/llm_runtime.py
   :lines: 4-97
   :language: python
   :linenos:

View File

@ -0,0 +1,8 @@
Sampling Techniques Showcase
============================
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_sampling.py.
.. literalinclude:: ../../../examples/llm-api/llm_sampling.py
   :lines: 4-229
   :language: python
   :linenos:

View File

@ -77,6 +77,7 @@ Welcome to TensorRT-LLM's Documentation!
:caption: Command-Line Reference
:hidden:
commands/trtllm-bench
commands/trtllm-build
commands/trtllm-serve

View File

@ -55,6 +55,12 @@ API Reference
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.MoeConfig
:members:
:undoc-members:
:show-inheritance:
:special-members: __init__
.. autoclass:: tensorrt_llm.llmapi.LookaheadDecodingConfig
:members:
:undoc-members:

View File

@ -201,7 +201,7 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py
`llm_options.yml`
```yaml
cuda_graph_config:
padding_enabled: true
enable_padding: true
batch_sizes:
- 1
- 2

View File

@ -55,9 +55,27 @@ The array elements are: GPU type, YAML file (without extension), shard index, an
2. Search `jenkins/L0_Test.groovy` for a stage whose YAML file matches (for example `l0_a100`) and whose name contains `[Post-Merge]` if the YAML entry uses `stage: post_merge`.
3. The resulting stage name(s) are what you pass to Jenkins via the `stage_list` parameter when triggering a job.
### Example
### Using `test_to_stage_mapping.py`
Manually searching YAML and Groovy files can be tedious. The helper script
`scripts/test_to_stage_mapping.py` automates the lookup:
```bash
python scripts/test_to_stage_mapping.py --tests "triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]"
python scripts/test_to_stage_mapping.py --tests gpt_ib_ptuning
python scripts/test_to_stage_mapping.py --stages A100X-Triton-Post-Merge-1
python scripts/test_to_stage_mapping.py --test-list my_tests.txt
python scripts/test_to_stage_mapping.py --test-list my_tests.yml
```
The first two commands print the Jenkins stages that run the specified tests or
patterns. Patterns are matched by substring, so partial test names are
supported out of the box. The third command lists every test executed in the
given stage. When providing tests on the command line, quote each test string
so the shell does not interpret the `[` and `]` characters as globs.
Alternatively, store the tests in a newline-separated text file or a YAML list
and supply it with `--test-list`.
`triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]` appears in `l0_a100.yml` under `stage: post_merge` and `backend: triton`. The corresponding Jenkins stages are `A100X-Triton-[Post-Merge]-1` and `A100X-Triton-[Post-Merge]-2` (two shards).
To run the same tests on your pull request, comment:
@ -67,6 +85,7 @@ To run the same tests on your pull request, comment:
This executes the same tests that run post-merge for this hardware/backend.
## Waiving tests
Sometimes a test is known to fail due to a bug or unsupported feature. Instead

View File

@ -123,6 +123,7 @@ In addition, older architectures can have limitations for newer software release
- TensorRT-LLM requires Linux x86_64 or Linux aarch64.
* - GPU Model Architectures
-
- [NVIDIA GB200 NVL72](https://www.nvidia.com/en-us/data-center/gb200-nvl72/)
- [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/)
- [NVIDIA Grace Hopper Superchip](https://www.nvidia.com/en-us/data-center/grace-hopper-superchip/)
- [NVIDIA Hopper Architecture](https://www.nvidia.com/en-us/data-center/technologies/hopper-architecture/)

View File

@ -37,7 +37,7 @@ The single-step flow of PyExecutor involves:
The core component of `PyExecutor` is the `ModelEngine`, responsible for executing the model's forward pass efficiently on the GPU.
The key method of `ModelEngine` is `forward`, which handles the forward pass computation.
For the PyTorch backend, the derived class is `PyTorchModelEngine`, declared in [pytorch_model_engine.py](../../../tensorrt_llm/_torch/pyexecutor/pytorch_model_engine.py).
For the PyTorch backend, the derived class is `PyTorchModelEngine`, declared in [model_engine.py](../../../tensorrt_llm/_torch/pyexecutor/model_engine.py).
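To make the division of labor concrete, here is a purely illustrative toy, not the real class: names and signatures are simplified assumptions that only mirror the `forward`-centric interface described above.

```python
from dataclasses import dataclass, field
from typing import Any, Callable, List

@dataclass
class ScheduledRequests:
    # Toy stand-in for the batches the scheduler hands to the engine.
    context_requests: List[List[int]] = field(default_factory=list)
    generation_requests: List[List[int]] = field(default_factory=list)

class ModelEngine:
    """Conceptual interface: owns the model and runs its forward pass."""
    def forward(self, scheduled: ScheduledRequests, resources: Any) -> dict:
        raise NotImplementedError

class ToyModelEngine(ModelEngine):
    """Flattens all scheduled tokens and calls the model once per step."""
    def __init__(self, model: Callable[[List[int]], List[float]]):
        self.model = model

    def forward(self, scheduled: ScheduledRequests, resources: Any) -> dict:
        tokens = [t for req in (scheduled.context_requests
                                + scheduled.generation_requests) for t in req]
        # The real engine builds GPU tensors and pulls KV-cache blocks from
        # the resource manager before invoking the model.
        return {"logits": self.model(tokens)}

engine = ToyModelEngine(lambda toks: [float(t) for t in toks])
print(engine.forward(ScheduledRequests(context_requests=[[1, 2, 3]]), None))
```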
## Decoder

View File

@ -15,4 +15,4 @@
| KV Cache Reuse | Yes | Yes | Yes | Untested | Untested | Untested | Yes | No | Yes | Yes | --- | | | |
| Slide Window Attention | Yes | Yes | Yes | Untested | Untested | Untested | Untested | Untested | Yes | Yes | WIP | --- | | |
| Logits Post Processor | No | Yes | Yes | No | Untested | No | No | No | Yes | Yes | Yes | Yes | --- | |
| Guided Decoding | No | Yes | Yes | Untested | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |
| Guided Decoding | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -70,7 +70,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -349,6 +349,8 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -384,6 +386,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -509,7 +513,6 @@ This feature is currently experimental, and the related API is subjected to chan
<h2>Environment Variables<a class="headerlink" href="#environment-variables" title="Link to this heading">#</a></h2>
<p>TRT-LLM uses some environment variables to control the behavior of disaggregated service.</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_UCX_KVCACHE</span></code>: Specifies whether to use UCX for KV cache transfer. The default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>. This must be enabled when using a disaggregated service.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_PARALLEL_CACHE_SEND</span></code>: If set to <code class="docutils literal notranslate"><span class="pre">1</span></code>, contextExecutor will attempt to send KV cache for multiple requests in parallel. The default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP</span></code>: If set to <code class="docutils literal notranslate"><span class="pre">1</span></code>, generationExecutor will not overlap KV cache transfer with model inference. The default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">TRTLLM_ENABLE_KVCACHE_RECEIVE_PARALLEL</span></code>: When the generation rank receives KV cache from multiple context ranks within a single context instance, it will receive KV cache from each rank sequentially. If set to <code class="docutils literal notranslate"><span class="pre">1</span></code>, the generation rank will receive KV cache from each rank within one context instance in parallel. The default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
@ -540,50 +543,17 @@ This feature is currently experimental, and the related API is subjected to chan
<section id="debugging-faqs">
<h3>Debugging FAQs<a class="headerlink" href="#debugging-faqs" title="Link to this heading">#</a></h3>
<p><em>Q. How to handle error <code class="docutils literal notranslate"><span class="pre">Disaggregated</span> <span class="pre">serving</span> <span class="pre">is</span> <span class="pre">not</span> <span class="pre">enabled,</span> <span class="pre">please</span> <span class="pre">check</span> <span class="pre">the</span> <span class="pre">configuration?</span></code></em></p>
<p>A. Please set the environment variables</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">export</span> <span class="n">TRTLLM_USE_UCX_KVCACHE</span><span class="o">=</span><span class="mi">1</span>
<p>A. Please set the <code class="docutils literal notranslate"><span class="pre">backendType</span></code> of <code class="docutils literal notranslate"><span class="pre">CacheTransceiverConfig</span></code>.</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">ExecutorConfig</span><span class="w"> </span><span class="n">executorConfig</span><span class="p">{...};</span>
<span class="n">executorConfig</span><span class="p">.</span><span class="n">setCacheTransceiverConfig</span><span class="p">(</span><span class="n">texec</span><span class="o">::</span><span class="n">CacheTransceiverConfig</span><span class="p">(</span><span class="n">BackendType</span><span class="o">::</span><span class="n">DEFAULT</span><span class="p">));</span>
</pre></div>
</div>
<p><em>Q. Why do some profiling tools show that TRT-LLMs KV cache transfer does not utilize NVLink even on devices equipped with NVLink?</em></p>
<p>A. Please check version of <code class="docutils literal notranslate"><span class="pre">UCX</span></code> with <code class="docutils literal notranslate"><span class="pre">ucx_info</span> <span class="pre">-v</span></code>.
If the version of UCX &lt;=1.17, set the environment variables <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_FRAG_MEM_TYPE=cuda</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> to enable NVLink. For Blackwell architecture GPUs, UCX version &gt;=1.19 is required to enable NVLink.
If the version of UCX &gt;=1.18, there are several ways to enable NVLink:</p>
<ol class="arabic simple">
<li><p>Set the environment variables <code class="docutils literal notranslate"><span class="pre">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B</span></code>,<code class="docutils literal notranslate"><span class="pre">UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda</span></code>, <code class="docutils literal notranslate"><span class="pre">UCX_CUDA_COPY_DMABUF=no</span></code>, <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_PIPELINE_ERROR_HANDLING=y</span></code>.</p></li>
<li><p>Set the environment variables <code class="docutils literal notranslate"><span class="pre">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size</span></code>, <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_PIPELINE_ERROR_HANDLING=y</span></code>. $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.</p></li>
</ol>
<p>When the environment variable <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_MPI_KVCACHE=1</span></code> is set, TRT-LLM will transfer the KV cache using <code class="docutils literal notranslate"><span class="pre">CUDA-aware</span> <span class="pre">MPI</span></code>. All executor processes involved must share the same MPI world communicator. Consequently, with <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_MPI_KVCACHE=1</span></code>, TRT-LLM only supports launching multiple executors via <code class="docutils literal notranslate"><span class="pre">MPI</span></code>. Additionally, the <code class="docutils literal notranslate"><span class="pre">CommunicationMode</span></code> for the executors must be set to <code class="docutils literal notranslate"><span class="pre">kLEADER</span></code> or <code class="docutils literal notranslate"><span class="pre">kORCHESTRATOR</span></code> with <code class="docutils literal notranslate"><span class="pre">SpawnProcesses=false</span></code> for the <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code>. These restrictions do not apply when <code class="docutils literal notranslate"><span class="pre">TRTLLM_USE_UCX_KVCACHE=1</span></code> is set.</p>
<p><em>Q. Does TRT-LLM support using GPU direct RDMA for inter-node KV Cache transfer?</em></p>
<p>A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer, but it is not enabled by default. There are several ways to enable GPU direct RDMA:</p>
<ol class="arabic simple">
<li><p>Set the environment variables <code class="docutils literal notranslate"><span class="pre">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B</span></code>,<code class="docutils literal notranslate"><span class="pre">UCX_RNDV_FRAG_MEM_TYPE=cuda</span></code>, <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_PIPELINE_ERROR_HANDLING=y</span></code>.</p></li>
<li><p>Set the environment variables <code class="docutils literal notranslate"><span class="pre">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size</span></code>, <code class="docutils literal notranslate"><span class="pre">UCX_MEMTYPE_CACHE=n</span></code> and <code class="docutils literal notranslate"><span class="pre">UCX_RNDV_PIPELINE_ERROR_HANDLING=y</span></code>, $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.</p></li>
</ol>
<p><em>Q. Are there any guidelines for performance tuning of KV cache transfer?</em></p>
<p>A. Depending on the users use case, certain sets of environment variables can help avoid poor KV cache transfer performance.</p>
<p>Environment Variable Set A</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">export</span> <span class="n">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE</span><span class="o">=</span><span class="mi">0</span><span class="n">B</span>
<span class="n">export</span> <span class="n">UCX_RNDV_FRAG_MEM_TYPES</span><span class="o">=</span><span class="n">cuda</span>
<span class="n">export</span> <span class="n">UCX_MEMTYPE_CACHE</span><span class="o">=</span><span class="n">n</span>
<span class="n">export</span> <span class="n">UCX_RNDV_PIPELINE_ERROR_HANDLING</span><span class="o">=</span><span class="n">y</span>
</pre></div>
</div>
<p>This set allows KV cache transfers to utilize NVLink within nodes and GDRDMA between nodes.</p>
<p>Environment Variable Set B</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">export</span> <span class="n">TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE</span><span class="o">=</span><span class="mi">0</span><span class="n">B</span>
<span class="n">export</span> <span class="n">UCX_CUDA_COPY_ASYNC_MEM_TYPE</span><span class="o">=</span><span class="n">cuda</span>
<span class="n">export</span> <span class="n">UCX_CUDA_COPY_DMABUF</span><span class="o">=</span><span class="n">no</span>
<span class="n">export</span> <span class="n">UCX_MEMTYPE_CACHE</span><span class="o">=</span><span class="n">n</span>
<span class="n">export</span> <span class="n">UCX_RNDV_PIPELINE_ERROR_HANDLING</span><span class="o">=</span><span class="n">y</span>
</pre></div>
</div>
<p>Set B may provide slightly better performance on a single node compared to Set A. However, when transferring KV cache across multiple nodes, it may cause program instability.</p>
<p>Environment Variable Set C</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size
export UCX_MEMTYPE_CACHE=n
export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
</pre></div>
</div>
<p>Set C can achieve better performance than Sets A and B, both within and between nodes. However, if the KV cache size exceeds the specified $Size, performance may degrade.</p>
<p>A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer.</p>
<p><em>Q. What causes the substantial bandwidth fluctuations in kvCache transfers, especially during the first few requests following service initialization?</em></p>
<p>A. The communication for kvCache transfer between executors is established dynamically, and the connection establishment process incurs significant overhead. This explains the apparently lower kvCache transfer bandwidth observed during the initial requests after service startup. When conducting benchmarks, it is recommended to perform a warm-up phase to ensure accurate performance measurements.</p>
</section>
</section>
</section>
@ -737,9 +707,9 @@ export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -70,7 +70,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -349,6 +349,8 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -384,6 +386,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -798,9 +802,9 @@ the TensorRT-LLM C++ Executor API.</p>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -70,7 +70,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -349,6 +349,8 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -384,6 +386,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -665,9 +669,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc4';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -70,7 +70,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.0.0rc3" />
<meta name="docsearch:version" content="1.0.0rc4" />
</head>
@ -349,6 +349,8 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_speculative_decoding.html">Speculative Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_runtime.html">Runtime Configuration Examples</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_sampling.html">Sampling Techniques Showcase</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Run LLM-API with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Run trtllm-bench with pytorch backend on Slurm</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Run trtllm-serve with pytorch backend on Slurm</a></li>
@ -384,6 +386,8 @@
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
@ -983,9 +987,9 @@ is computed as:</p>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on July 14, 2025.</p>
<p>Last updated on July 19, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/cfcb97a">cfcb97a</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/69e9f6d">69e9f6d</a>.</p>
</div></div>

Some files were not shown because too many files have changed in this diff.