Update GitHub pages in root to v1.1.0rc3

Kaiyu Xie committed 2025-09-04 03:19:11 +00:00
parent 247a5c27d1
commit 03a1561a27
217 changed files with 15608 additions and 14027 deletions

View File

@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 6e42667ce0c3f76b3f7a51cbd2d67bd7
+config: 57da472845e6079ef61d1a59a2a83dc9
 tags: 645f666f9bcd5a90fca523b33c5a78b7

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -316,7 +316,8 @@ class Attention(nn.Module):
         has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
                            or self.o_proj.has_fp8_block_scales
                            or self.o_proj.has_fp8_rowwise)
-        if has_quant_scale and self.attn.has_fp8_kv_cache:
+        if has_quant_scale and (self.attn.has_fp8_kv_cache
+                                or self.attn.has_fp4_kv_cache):
             out_dtype = torch.float8_e4m3fn
         output = q.new_empty([num_tokens, hidden_size], dtype=out_dtype)
         return output
@@ -361,6 +362,13 @@ class Attention(nn.Module):
         if self.o_proj.has_nvfp4 and self.support_nvfp4_output and enable_attn_nvfp4_output:
             out_scale_sf = self.o_proj.input_scale

+        kv_scales_sf = None
+        kv_scales_sf_inv = None
+        if self.quant_config is not None and self.quant_config.layer_quant_mode.has_fp4_kv_cache(
+        ):
+            kv_scales_sf = self.qkv_proj.kv_scales
+            kv_scales_sf_inv = self.qkv_proj.inv_kv_scales
+
         mrope_config = None
         if mrope_rotary_cos_sin is not None or mrope_position_deltas is not None:
             mrope_config = dict()
@@ -376,6 +384,8 @@
             attn_metadata,
             out_scale=out_scale,
             out_scale_sf=out_scale_sf,
+            kv_scales_sf=kv_scales_sf,
+            kv_scales_sf_inv=kv_scales_sf_inv,
             attention_mask=attention_mask,
             mrope_config=mrope_config,
             attention_window_size=attention_window_size,

View File

@@ -10,7 +10,7 @@ import traceback
 import weakref
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple

 import torch
 import torch._dynamo.config
@@ -21,6 +21,7 @@ from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
 from tensorrt_llm._torch.pyexecutor.sampler import SampleStateTensors
 from tensorrt_llm._torch.speculative import (
     get_num_extra_kv_tokens, update_spec_config_from_model_config)
+from tensorrt_llm._torch.speculative.drafting_loops import ChainDrafter
 from tensorrt_llm._torch.speculative.mtp import SampleStateTensorsMTP
 from tensorrt_llm._utils import (is_trace_enabled, nvtx_range, release_gc,
                                  str_dtype_to_torch, torch_dtype_to_str,
@@ -99,7 +100,7 @@ _KV_CACHE_MAP = {
     "nvfp4": QuantAlgo.NVFP4.value,
     "auto": "auto"
 }
-_VALID_KV_CACHE_DTYPES = ("fp8", "auto")
+_VALID_KV_CACHE_DTYPES = ("fp8", "nvfp4", "auto")


 def validate_and_set_mamba_ssm_cache_dtype(config: ModelConfig,
@@ -276,6 +277,8 @@ class PyTorchModelEngine(ModelEngine):
         spec_config: Optional["DecodingBaseConfig"] = None,
         lora_config: Optional[LoraConfig] = None,
         is_draft_model: bool = False,
+        drafting_loop_wrapper: Optional[Callable[[torch.nn.Module],
+                                                 torch.nn.Module]] = None,
     ):
         self.ub_buffers = None
         self.batch_size = batch_size
@@ -311,7 +314,8 @@ class PyTorchModelEngine(ModelEngine):
             max_num_tokens=max_num_tokens,
             moe_max_num_tokens=pytorch_backend_config.moe_max_num_tokens,
             moe_load_balancer=pytorch_backend_config.moe_load_balancer,
-            lora_config=lora_config)
+            lora_config=lora_config,
+            drafting_loop_wrapper=drafting_loop_wrapper)
         # In case that some tests use stub models and override `_load_model`.
         if not hasattr(self.model, 'extra_attrs'):
             self.model.extra_attrs = {}
@@ -403,7 +407,7 @@ class PyTorchModelEngine(ModelEngine):
                 dtype=torch.int,
                 device='cuda')
             self.without_logits = self.spec_config.spec_dec_mode.without_logits(
-            )
+            ) or self.model_is_wrapped
             self.max_draft_len = spec_config.max_draft_len
         else:
             self.without_logits = False
@@ -562,6 +566,15 @@ class PyTorchModelEngine(ModelEngine):
         # Reset the global cuda graph dummy request to None in warmup.
         self.cuda_graph_runner.padding_dummy_request = None

+        def get_num_extra_decoding_steps():
+            if isinstance(self.model, ChainDrafter):
+                return self.model.max_draft_len
+            else:
+                assert not self.model_is_wrapped, (
+                    f"Please add logic to determine num_extra_decoding_steps for drafting loop {type(self.model)}"
+                )
+                return 0
+
         def get_cuda_graph_warmup_request(batch_size, draft_len):
             # Divide by max_beam_width to get an approximation of the number of requests that can be run in parallel.
             available_blocks = kv_cache_manager.get_num_free_blocks(
@@ -569,6 +582,8 @@
             if available_blocks >= batch_size:
                 result = ScheduledRequests()
                 result.context_requests = []
+                num_extra_decoding_steps = get_num_extra_decoding_steps()
+
                 # Add (batch_size - 1) dummy requests with seq_len=1.
                 # Should only need one more page per request.
                 requests = kv_cache_manager.add_dummy_requests(
@@ -576,7 +591,8 @@
                     is_gen=True,
                     max_num_draft_tokens=draft_len,
                     use_mrope=use_mrope,
-                    max_beam_width=self.max_beam_width)
+                    max_beam_width=self.max_beam_width,
+                    num_extra_decoding_steps=num_extra_decoding_steps)
                 # Divide by max_beam_width to get an approximation of the number of tokens that can be added to the final request.
                 available_tokens = kv_cache_manager.get_num_available_tokens(
                     draft_len)
@@ -592,13 +608,20 @@
                 if max_position_embeddings is not None:
                     token_num = min(token_num,
                                     max_position_embeddings - draft_len)
+
+                assert token_num > num_extra_decoding_steps, (
+                    "Cannot fuse drafting loop. We do not have enough KV cache space "
+                    "for all of the draft tokens.")
+                token_num -= num_extra_decoding_steps
+
                 max_seq_len_request = kv_cache_manager.add_dummy_requests(
                     request_ids=[batch_size - 1],
                     token_nums=[token_num],
                     is_gen=True,
                     max_num_draft_tokens=draft_len,
                     use_mrope=use_mrope,
-                    max_beam_width=self.max_beam_width)[0]
+                    max_beam_width=self.max_beam_width,
+                    num_extra_decoding_steps=num_extra_decoding_steps)[0]
                 # Add the longest request before all other seq_len=1 request to simulate the padding CUDA graph case.
                 # This batch contains both the longest request and the shortest requests,
                 # it also contains the maximum number of requests and the maximum token number,
@@ -620,6 +643,13 @@
             if num_tokens > self.max_num_tokens or num_tokens > available_tokens:
                 return None

+            num_extra_decoding_steps = get_num_extra_decoding_steps()
+            if num_extra_decoding_steps > 0:
+                # Disable autotuning for fused drafting loops for now.
+                # There are a few bugs that can cause illegal memory accesses
+                # during warmup.
+                return None
+
             num_ctx_tokens = num_tokens - num_gen_tokens
             num_ctx_requests = 0
             ctx_requests = []
@@ -905,6 +935,8 @@
                     moe_max_num_tokens: Optional[int] = None,
                     moe_load_balancer: Optional[MoeLoadBalancerConfig] = None,
                     lora_config: Optional[LoraConfig] = None,
+                    drafting_loop_wrapper: Optional[Callable[
+                        [torch.nn.Module], torch.nn.Module]] = None,
                     **kwargs) -> DecoderModelForCausalLM:
         config = checkpoint_loader.load_config(
             checkpoint_dir,
@@ -1008,6 +1040,13 @@
             logger.info("moe_load_balancer finalize model done")
             torch.cuda.current_stream().synchronize()

+        if drafting_loop_wrapper is not None:
+            model = drafting_loop_wrapper(model)
+            self.model_is_wrapped = True
+        else:
+            self.model_is_wrapped = False
+
         return model

     def _call_load_weights(self, load_method, weights, weight_mapper):
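To see how the new hook composes, here is a rough sketch of the intended wiring. The ChainDrafter body below is a stand-in, not the real class from tensorrt_llm._torch.speculative.drafting_loops: it models only the pieces the diff relies on, namely wrapping the target model and exposing the max_draft_len attribute that get_num_extra_decoding_steps reads.

```python
import torch

class ChainDrafter(torch.nn.Module):
    """Stand-in for the real ChainDrafter drafting-loop wrapper."""

    def __init__(self, model: torch.nn.Module, max_draft_len: int):
        super().__init__()
        self.model = model
        # Warmup uses this to budget extra KV-cache slots per request.
        self.max_draft_len = max_draft_len

    def forward(self, *args, **kwargs):
        # A real fused drafting loop would run the wrapped model up to
        # max_draft_len extra times per step; elided in this sketch.
        return self.model(*args, **kwargs)

def drafting_loop_wrapper(model: torch.nn.Module) -> torch.nn.Module:
    # Matches the Optional[Callable[[torch.nn.Module], torch.nn.Module]]
    # parameter that PyTorchModelEngine and _load_model now accept.
    return ChainDrafter(model, max_draft_len=4)

# engine = PyTorchModelEngine(..., drafting_loop_wrapper=drafting_loop_wrapper)
```

Passing such a callable sets model_is_wrapped, which in turn forces the without_logits path and disables autotuning during warmup, as the hunks above show.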

View File

@@ -58,7 +58,7 @@
     <script>
       DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
       DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-      DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
+      DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
       DOCUMENTATION_OPTIONS.show_version_warning_banner =
         false;
     </script>
@@ -68,7 +68,7 @@
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.1.0rc2" />
+<meta name="docsearch:version" content="1.1.0rc3" />
 </head>
@@ -464,6 +464,10 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
 </ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
+</ul>
 </div>
 </nav></div>
 </div>
@@ -699,9 +703,9 @@
 <div class="footer-item">
 <div class="extra_footer">
-<p>Last updated on August 29, 2025.</p>
+<p>Last updated on September 02, 2025.</p>
-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
 </div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -2048,9 +2052,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -683,9 +687,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1408,9 +1412,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -801,9 +805,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -8785,9 +8789,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -657,9 +661,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -3520,9 +3524,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -664,9 +668,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -913,9 +917,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1380,9 +1384,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1228,9 +1232,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1254,9 +1258,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1018,9 +1022,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -673,9 +677,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -957,9 +961,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1680,9 +1684,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -3520,9 +3524,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -781,9 +785,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1159,9 +1163,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -891,9 +895,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1195,9 +1199,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -803,9 +807,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -820,9 +824,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1019,9 +1023,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -848,9 +852,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -679,9 +683,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -932,9 +936,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -830,9 +834,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -694,9 +698,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -820,9 +824,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -914,9 +918,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -996,9 +1000,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1032,9 +1036,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1968,9 +1972,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -2875,9 +2879,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -755,9 +759,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -917,9 +921,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -845,9 +849,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1037,9 +1041,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -964,9 +968,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1067,9 +1071,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -693,9 +697,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -845,9 +849,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -785,9 +789,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -919,9 +923,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1267,9 +1271,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1112,9 +1116,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -752,9 +756,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -902,9 +906,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -2213,9 +2217,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1279,9 +1283,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -2682,9 +2686,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -817,9 +821,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -751,9 +755,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -819,9 +823,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -822,9 +826,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -866,9 +870,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -962,9 +966,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1265,9 +1269,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -952,9 +956,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1493,9 +1497,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1095,9 +1099,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1908,9 +1912,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1176,9 +1180,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -5467,9 +5471,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1123,9 +1127,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1639,9 +1643,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1849,9 +1853,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -3439,9 +3443,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -983,9 +987,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -58,7 +58,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -68,7 +68,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -464,6 +464,10 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1121,9 +1125,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -4,6 +4,24 @@ Executor
.. Here are files in the cpp/include/executor
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
transferAgent.h
_______________
.. doxygenfile:: transferAgent.h
:project: TensorRT-LLM
types.h
_______
.. doxygenfile:: types.h
:project: TensorRT-LLM
cacheCommunicator.h
___________________
.. doxygenfile:: cacheCommunicator.h
:project: TensorRT-LLM
disaggServerUtil.h
__________________
@ -16,24 +34,6 @@ ________
.. doxygenfile:: tensor.h
:project: TensorRT-LLM
transferAgent.h
_______________
.. doxygenfile:: transferAgent.h
:project: TensorRT-LLM
serialization.h
_______________
.. doxygenfile:: serialization.h
:project: TensorRT-LLM
types.h
_______
.. doxygenfile:: types.h
:project: TensorRT-LLM
executor.h
__________
@ -46,9 +46,9 @@ ______________________
.. doxygenfile:: dataTransceiverState.h
:project: TensorRT-LLM
cacheCommunicator.h
___________________
serialization.h
_______________
.. doxygenfile:: cacheCommunicator.h
.. doxygenfile:: serialization.h
:project: TensorRT-LLM

View File

@ -4,148 +4,22 @@ Runtime
.. Here are files in the cpp/include/runtime
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
lookaheadBuffers.h
__________________
.. doxygenfile:: lookaheadBuffers.h
:project: TensorRT-LLM
lookaheadModule.h
_________________
.. doxygenfile:: lookaheadModule.h
:project: TensorRT-LLM
iBuffer.h
_________
.. doxygenfile:: iBuffer.h
:project: TensorRT-LLM
modelConfig.h
_____________
.. doxygenfile:: modelConfig.h
:project: TensorRT-LLM
decodingOutput.h
________________
.. doxygenfile:: decodingOutput.h
:project: TensorRT-LLM
promptTuningParams.h
____________________
.. doxygenfile:: promptTuningParams.h
:project: TensorRT-LLM
bufferManager.h
_______________
.. doxygenfile:: bufferManager.h
:project: TensorRT-LLM
gptJsonConfig.h
_______________
.. doxygenfile:: gptJsonConfig.h
:project: TensorRT-LLM
runtimeDefaults.h
_________________
.. doxygenfile:: runtimeDefaults.h
:project: TensorRT-LLM
loraCache.h
___________
.. doxygenfile:: loraCache.h
:project: TensorRT-LLM
rawEngine.h
___________
.. doxygenfile:: rawEngine.h
:project: TensorRT-LLM
gptDecoder.h
____________
.. doxygenfile:: gptDecoder.h
:project: TensorRT-LLM
eagleBuffers.h
______________
.. doxygenfile:: eagleBuffers.h
:project: TensorRT-LLM
medusaModule.h
______________
.. doxygenfile:: medusaModule.h
:project: TensorRT-LLM
virtualMemory.h
_______________
.. doxygenfile:: virtualMemory.h
:project: TensorRT-LLM
explicitDraftTokensBuffers.h
____________________________
.. doxygenfile:: explicitDraftTokensBuffers.h
:project: TensorRT-LLM
iTensor.h
_________
.. doxygenfile:: iTensor.h
:project: TensorRT-LLM
common.h
________
.. doxygenfile:: common.h
:project: TensorRT-LLM
loraCachePageManagerConfig.h
____________________________
.. doxygenfile:: loraCachePageManagerConfig.h
:project: TensorRT-LLM
worldConfig.h
_____________
.. doxygenfile:: worldConfig.h
:project: TensorRT-LLM
loraModule.h
____________
.. doxygenfile:: loraModule.h
:project: TensorRT-LLM
speculativeDecodingMode.h
_________________________
.. doxygenfile:: speculativeDecodingMode.h
:project: TensorRT-LLM
cudaEvent.h
___________
.. doxygenfile:: cudaEvent.h
:project: TensorRT-LLM
decodingInput.h
virtualMemory.h
_______________
.. doxygenfile:: decodingInput.h
.. doxygenfile:: virtualMemory.h
:project: TensorRT-LLM
speculativeDecodingModule.h
@ -154,40 +28,10 @@ ___________________________
.. doxygenfile:: speculativeDecodingModule.h
:project: TensorRT-LLM
iGptDecoderBatched.h
____________________
common.h
________
.. doxygenfile:: iGptDecoderBatched.h
:project: TensorRT-LLM
eagleModule.h
_____________
.. doxygenfile:: eagleModule.h
:project: TensorRT-LLM
tllmLogger.h
____________
.. doxygenfile:: tllmLogger.h
:project: TensorRT-LLM
gptDecoderBatched.h
___________________
.. doxygenfile:: gptDecoderBatched.h
:project: TensorRT-LLM
cudaStream.h
____________
.. doxygenfile:: cudaStream.h
:project: TensorRT-LLM
ipcNvlsMemory.h
_______________
.. doxygenfile:: ipcNvlsMemory.h
.. doxygenfile:: common.h
:project: TensorRT-LLM
samplingConfig.h
@ -196,16 +40,136 @@ ________________
.. doxygenfile:: samplingConfig.h
:project: TensorRT-LLM
tllmLogger.h
____________
.. doxygenfile:: tllmLogger.h
:project: TensorRT-LLM
lookaheadModule.h
_________________
.. doxygenfile:: lookaheadModule.h
:project: TensorRT-LLM
modelConfig.h
_____________
.. doxygenfile:: modelConfig.h
:project: TensorRT-LLM
iGptDecoderBatched.h
____________________
.. doxygenfile:: iGptDecoderBatched.h
:project: TensorRT-LLM
cudaStream.h
____________
.. doxygenfile:: cudaStream.h
:project: TensorRT-LLM
loraCache.h
___________
.. doxygenfile:: loraCache.h
:project: TensorRT-LLM
medusaModule.h
______________
.. doxygenfile:: medusaModule.h
:project: TensorRT-LLM
decoderState.h
______________
.. doxygenfile:: decoderState.h
:project: TensorRT-LLM
ipcUtils.h
__________
lookaheadBuffers.h
__________________
.. doxygenfile:: ipcUtils.h
.. doxygenfile:: lookaheadBuffers.h
:project: TensorRT-LLM
eagleModule.h
_____________
.. doxygenfile:: eagleModule.h
:project: TensorRT-LLM
runtimeDefaults.h
_________________
.. doxygenfile:: runtimeDefaults.h
:project: TensorRT-LLM
decodingOutput.h
________________
.. doxygenfile:: decodingOutput.h
:project: TensorRT-LLM
decodingInput.h
_______________
.. doxygenfile:: decodingInput.h
:project: TensorRT-LLM
worldConfig.h
_____________
.. doxygenfile:: worldConfig.h
:project: TensorRT-LLM
gptDecoderBatched.h
___________________
.. doxygenfile:: gptDecoderBatched.h
:project: TensorRT-LLM
explicitDraftTokensBuffers.h
____________________________
.. doxygenfile:: explicitDraftTokensBuffers.h
:project: TensorRT-LLM
bufferManager.h
_______________
.. doxygenfile:: bufferManager.h
:project: TensorRT-LLM
loraModule.h
____________
.. doxygenfile:: loraModule.h
:project: TensorRT-LLM
eagleBuffers.h
______________
.. doxygenfile:: eagleBuffers.h
:project: TensorRT-LLM
speculativeDecodingMode.h
_________________________
.. doxygenfile:: speculativeDecodingMode.h
:project: TensorRT-LLM
promptTuningParams.h
____________________
.. doxygenfile:: promptTuningParams.h
:project: TensorRT-LLM
gptDecoder.h
____________
.. doxygenfile:: gptDecoder.h
:project: TensorRT-LLM
memoryCounters.h
@ -214,3 +178,39 @@ ________________
.. doxygenfile:: memoryCounters.h
:project: TensorRT-LLM
ipcNvlsMemory.h
_______________
.. doxygenfile:: ipcNvlsMemory.h
:project: TensorRT-LLM
rawEngine.h
___________
.. doxygenfile:: rawEngine.h
:project: TensorRT-LLM
ipcUtils.h
__________
.. doxygenfile:: ipcUtils.h
:project: TensorRT-LLM
iBuffer.h
_________
.. doxygenfile:: iBuffer.h
:project: TensorRT-LLM
gptJsonConfig.h
_______________
.. doxygenfile:: gptJsonConfig.h
:project: TensorRT-LLM
loraCachePageManagerConfig.h
____________________________
.. doxygenfile:: loraCachePageManagerConfig.h
:project: TensorRT-LLM

View File

@ -3,6 +3,6 @@ Runtime Configuration Examples
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_runtime.py.
.. literalinclude:: ../../../examples/llm-api/llm_runtime.py
:lines: 4-97
:lines: 4-96
:language: python
:linenos:
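As context for the changed line range, `llm_runtime.py` walks through runtime configuration of the LLM API. The snippet below is an illustrative sketch of that pattern rather than the bundled file, assuming the `KvCacheConfig` class from `tensorrt_llm.llmapi` and made-up fraction values:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Illustrative values: cap the KV cache at 80% of free GPU memory
# and enable block reuse across requests.
kv_cache_config = KvCacheConfig(
    free_gpu_memory_fraction=0.8,
    enable_block_reuse=True,
)

llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    kv_cache_config=kv_cache_config,
)
print(llm.generate("Runtime configuration check:").outputs[0].text)
```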

View File

@ -160,6 +160,12 @@ Welcome to TensorRT-LLM's Documentation!
blogs/XQA-kernel.md
blogs/tech_blog/*
.. toctree::
   :maxdepth: 2
   :caption: Use TensorRT Engine
   :hidden:

   legacy/tensorrt_quickstart.md
Indices and tables
==================

View File

@ -0,0 +1,9 @@
# LLM API with TensorRT Engine
A simple inference example with TinyLlama using the LLM API:
```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
:language: python
:linenos:
```
For more advanced usage, including distributed inference, multimodal models, and speculative decoding, refer to this [README](../../../examples/llm-api/README.md).
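The included `quickstart_example.py` is not reproduced in this diff; as a rough sketch, it follows the usual LLM API pattern shown below. The model name and sampling values here are illustrative assumptions, not the exact contents of the bundled script:

```python
from tensorrt_llm import LLM, SamplingParams

def main():
    prompts = [
        "Hello, my name is",
        "The capital of France is",
    ]
    # Illustrative sampling settings; the bundled script may differ.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Downloads the checkpoint from the Hugging Face Hub on first use
    # and prepares the engine behind the scenes.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    for output in llm.generate(prompts, sampling_params):
        print(f"{output.prompt!r} -> {output.outputs[0].text!r}")

if __name__ == "__main__":
    main()
```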

View File

@ -280,7 +280,7 @@ API Reference
:special-members: __init__
:member-order: groupwise
:inherited-members:
:exclude-members: parse_obj,model_fields,model_fields_set,model_validate_json,dict,model_dump_json,model_computed_fields,parse_file,construct,model_copy,json,update_forward_refs,model_config,copy,from_orm,model_rebuild,model_validate,schema_json,parse_raw,model_dump,model_extra,model_post_init,validate,model_parametrized_name,model_json_schema,model_construct,schema,model_validate_strings
:exclude-members: model_extra,model_copy,model_validate_strings,model_dump_json,model_validate,copy,model_fields_set,construct,from_orm,json,model_construct,parse_raw,model_post_init,model_parametrized_name,schema,parse_obj,model_fields,model_validate_json,model_computed_fields,update_forward_refs,dict,model_json_schema,parse_file,model_dump,validate,schema_json,model_rebuild,model_config
.. autoclass:: tensorrt_llm.llmapi.TrtLlmArgs
:members:
@ -289,7 +289,7 @@ API Reference
:special-members: __init__
:member-order: groupwise
:inherited-members:
:exclude-members: parse_obj,model_fields,model_fields_set,model_validate_json,dict,model_dump_json,model_computed_fields,parse_file,construct,model_copy,json,update_forward_refs,model_config,copy,from_orm,model_rebuild,model_validate,schema_json,parse_raw,model_dump,model_extra,model_post_init,validate,model_parametrized_name,model_json_schema,model_construct,schema,model_validate_strings
:exclude-members: model_extra,model_copy,model_validate_strings,model_dump_json,model_validate,copy,model_fields_set,construct,from_orm,json,model_construct,parse_raw,model_post_init,model_parametrized_name,schema,parse_obj,model_fields,model_validate_json,model_computed_fields,update_forward_refs,dict,model_json_schema,parse_file,model_dump,validate,schema_json,model_rebuild,model_config
.. autoclass:: tensorrt_llm.llmapi.AutoDecodingConfig
:members:

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -733,9 +737,9 @@ This feature is currently in prototype, and the related API is subject to change
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -824,9 +828,9 @@ the TensorRT-LLM C++ Executor API.</p>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -690,9 +694,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1007,9 +1011,9 @@ is computed as:</p>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -1050,9 +1054,9 @@ The <code class="docutils literal notranslate"><span class="pre">GptDecoder</spa
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -870,9 +874,9 @@ techniques to optimize the underlying graph. It provides a wrapper similar to P
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -775,9 +779,9 @@ An “event” is any significant change in the lifecycle or state of a KV cache
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -748,9 +752,9 @@ Assume vocabulary size is 100, which means normal text token ids are in range [0
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -862,9 +866,9 @@ The shape of <code class="docutils literal notranslate"><span class="pre">LoraWe
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -69,7 +69,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -469,6 +469,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -711,9 +715,9 @@ This feature is optimized for PCIe-based GPU topologies and may affect model acc
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -69,7 +69,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -469,6 +469,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -666,9 +670,9 @@ Note that support for these static libraries will be gradually deprioritized in
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@ -862,9 +866,9 @@ However, similar to any new model, you can follow the same approach to define yo
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on August 29, 2025.</p>
<p>Last updated on September 02, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>

View File

@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -69,7 +69,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@@ -469,6 +469,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@@ -699,9 +703,9 @@ python3<span class="w"> </span>examples/summarize.py<span class="w"> </span><spa
<div class="footer-item">
<div class="extra_footer">
-<p>Last updated on August 29, 2025.</p>
+<p>Last updated on September 02, 2025.</p>
-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>
View File
@@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@@ -761,9 +765,9 @@ python<span class="w"> </span>../summarize.py<span class="w"> </span>--engine_di
<div class="footer-item">
<div class="extra_footer">
-<p>Last updated on August 29, 2025.</p>
+<p>Last updated on September 02, 2025.</p>
-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>
View File
@@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@@ -1028,9 +1032,9 @@ trtllm-build<span class="w"> </span>--checkpoint_dir<span class="w"> </span>./op
<div class="footer-item">
<div class="extra_footer">
-<p>Last updated on August 29, 2025.</p>
+<p>Last updated on September 02, 2025.</p>
-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>
View File
@@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -71,7 +71,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@@ -471,6 +471,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@@ -1032,9 +1036,9 @@ srun<span class="w"> </span><span class="se">\</span>
<div class="footer-item">
<div class="extra_footer">
-<p>Last updated on August 29, 2025.</p>
+<p>Last updated on September 02, 2025.</p>
-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>
View File
@@ -59,7 +59,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc2';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.1.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -69,7 +69,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="1.1.0rc2" />
<meta name="docsearch:version" content="1.1.0rc3" />
</head>
@@ -469,6 +469,10 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.html">Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.html">Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Use TensorRT Engine</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../legacy/tensorrt_quickstart.html">LLM API with TensorRT Engine</a></li>
</ul>
</div>
</nav></div>
</div>
@@ -950,9 +954,9 @@ The support for Qwen-1 is in <code class="docutils literal notranslate"><span cl
<div class="footer-item">
<div class="extra_footer">
-<p>Last updated on August 29, 2025.</p>
+<p>Last updated on September 02, 2025.</p>
-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/15ec2b8">15ec2b8</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/e81c50d">e81c50d</a>.</p>
</div></div>
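Aside from the new "Use TensorRT Engine" sidebar section (which comes from a toctree change and is re-rendered by Sphinx, not patched as a string), every page above carries the same three literal substitutions: the 1.1.0rc2 to 1.1.0rc3 version strings, the footer "Last updated" date, and the generating commit hash 15ec2b8 to e81c50d. Below is a minimal sketch of that bulk rewrite, assuming the built pages sit under a local docs/ directory; in practice these files are produced by rerunning the Sphinx build rather than edited in place.

import pathlib

# The three literal substitutions this commit applies to every generated page.
SUBS = [
    ("1.1.0rc2", "1.1.0rc3"),                   # switcher + docsearch version strings
    ("August 29, 2025", "September 02, 2025"),  # footer "Last updated" date
    ("15ec2b8", "e81c50d"),                     # generating commit, in both href and link text
]

for page in pathlib.Path("docs").rglob("*.html"):  # assumed location of the built pages
    text = page.read_text(encoding="utf-8")
    for old, new in SUBS:
        text = text.replace(old, new)
    page.write_text(text, encoding="utf-8")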
Some files were not shown because too many files have changed in this diff.