Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)

Commit ae05ac189f: Update latest GitHub pages to v1.0.0rc6
Parent: dea56538b2
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 04d569d8861c27285138a24e2af3e496
+config: 4c4e434803756ce4857c43609ad607a5
 tags: 645f666f9bcd5a90fca523b33c5a78b7

File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -10,7 +10,8 @@ from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping

 from ..attention_backend import (AttentionInputType, AttentionMetadata,
-                                 TrtllmAttention, TrtllmAttentionMetadata)
+                                 FlashInferAttentionMetadata, TrtllmAttention,
+                                 TrtllmAttentionMetadata)
 from ..attention_backend.interface import (AttentionMask,
                                            PositionalEmbeddingParams,
                                            PredefinedAttentionMask)
@@ -18,13 +19,83 @@ from ..attention_backend.utils import create_attention, get_attention_backend
 from ..distributed import AllReduceParams
 from ..model_config import ModelConfig
 from ..peft.lora.layer import LoraLayer, LoraModuleType
-from ..utils import Fp4QuantizedTensor, get_model_extra_attrs
+from ..utils import (Fp4QuantizedTensor, get_model_extra_attrs,
+                     is_torch_compiling)
 from .linear import Linear, TensorParallelMode, WeightMode, WeightsLoadingConfig
 from .multi_stream_utils import maybe_execute_in_parallel
 from .rms_norm import RMSNorm
 from .rotary_embedding import RotaryEmbedding


+def extract_extra_attrs(layer_idx: str, attn_type: str):
+    assert attn_type in ["mla", "attn"], "Invalid attention type"
+    extra_attrs = get_model_extra_attrs()
+    assert extra_attrs is not None, "Model extra attrs is not set"
+
+    metadata_ref = extra_attrs.get("attention_metadata", None)
+    assert metadata_ref is not None, "Attention metadata is not set"
+    metadata = metadata_ref()
+    if attn_type == "mla":
+        assert isinstance(
+            metadata,
+            TrtllmAttentionMetadata,
+        )
+    else:
+        assert isinstance(
+            metadata,
+            FlashInferAttentionMetadata,
+        ) or isinstance(
+            metadata,
+            TrtllmAttentionMetadata,
+        )
+
+    attn_layers = extra_attrs.get(attn_type + "_layers", None)
+    assert attn_layers is not None, "Attention layer is not registered"
+    attn_layer_ref = attn_layers.get(layer_idx, None)
+    assert attn_layer_ref is not None, f"Cannot find attention layer for layer {layer_idx}"
+    attn_layer = attn_layer_ref()
+
+    if attn_type == "mla":
+        assert isinstance(
+            attn_layer,
+            MLA), "MLA layer must be a subclass of MLA or an instance of MLA"
+    elif attn_type == "attn":
+        assert isinstance(
+            attn_layer, Attention
+        ), "Attention layer must be a subclass of Attention or an instance of Attention"
+
+    return metadata, attn_layer
+
+
+@torch.library.custom_op("trtllm::attn_custom_op_inplace",
+                         mutates_args=("output", ))
+def attn_custom_op_inplace(
+    q: torch.Tensor,
+    k: Optional[torch.Tensor],
+    v: Optional[torch.Tensor],
+    attention_mask: str,
+    mrope_rotary_cos_sin: Optional[torch.Tensor],
+    mrope_position_deltas: Optional[torch.Tensor],
+    attention_window_size: Optional[int],
+    attention_mask_data: Optional[torch.Tensor],
+    layer_idx: str,
+    output: torch.Tensor,
+) -> None:
+    metadata, attn_layer = extract_extra_attrs(layer_idx, "attn")
+    # NVFP4 output cannot be supported by torch compile for TRTLLM backend.
+    attn_layer._attn_impl(q,
+                          k,
+                          v,
+                          metadata,
+                          PredefinedAttentionMask(attention_mask),
+                          mrope_rotary_cos_sin,
+                          mrope_position_deltas,
+                          attention_window_size,
+                          attention_mask_data,
+                          False,
+                          output=output)
+
+
 class Attention(nn.Module):

     def __init__(
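A note on the mechanism used above: torch.library.custom_op registers the function as a single opaque operator, and mutates_args=("output", ) declares which argument the op writes in place, so torch.compile can keep the attention call as one graph node instead of tracing into backend internals. Below is a minimal, self-contained sketch of the same mechanism; the demo::scale_inplace op and every name in it are hypothetical, not part of TensorRT-LLM.

import torch

# Hypothetical op for illustration only; registered under torch.ops.demo.
@torch.library.custom_op("demo::scale_inplace", mutates_args=("output", ))
def scale_inplace(x: torch.Tensor, factor: float, output: torch.Tensor) -> None:
    # The body is opaque to torch.compile; only the signature and the
    # mutates_args annotation are visible to the graph.
    output.copy_(x * factor)


@scale_inplace.register_fake
def _(x: torch.Tensor, factor: float, output: torch.Tensor) -> None:
    # Fake (meta) implementation used during tracing: no computation needed,
    # since the op returns nothing and only mutates a preallocated tensor.
    return None


def f(x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    scale_inplace(x, 2.0, out)  # also reachable as torch.ops.demo.scale_inplace
    return out


print(torch.compile(f)(torch.ones(4)))  # tensor([2., 2., 2., 2.])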
@@ -64,6 +135,16 @@ class Attention(nn.Module):
         """
         super().__init__()
         self.layer_idx = layer_idx
+        self.layer_idx_str = str(layer_idx)
+
+        self.register_to_config = False
+        # We only register TRTLLM attention layers to config.
+        if config is not None:
+            if "attn_layers" not in config.extra_attrs:
+                config.extra_attrs["attn_layers"] = {}
+            config.extra_attrs["attn_layers"][self.layer_idx_str] = weakref.ref(
+                self)
+            self.register_to_config = True

         config = config or ModelConfig()
         self.hidden_size = hidden_size
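The registration above is the other half of the extract_extra_attrs lookup earlier in the diff: a custom op can only accept tensors and plain scalars, so each layer publishes itself in a registry keyed by its stringified layer index, and the op body recovers the module from that key. A minimal sketch of the round trip, assuming an ordinary dict in place of config.extra_attrs:

import weakref

_attn_layers = {}  # stands in for config.extra_attrs["attn_layers"]


class ToyLayer:

    def __init__(self, layer_idx: int):
        self.layer_idx_str = str(layer_idx)
        # A weakref keeps the registry from holding the layer alive on its own.
        _attn_layers[self.layer_idx_str] = weakref.ref(self)


def lookup(layer_idx: str) -> ToyLayer:
    layer_ref = _attn_layers.get(layer_idx, None)
    assert layer_ref is not None, f"Cannot find attention layer for layer {layer_idx}"
    layer = layer_ref()  # dereference; None if the layer was garbage collected
    assert layer is not None
    return layer


layer = ToyLayer(3)
assert lookup("3") is layer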
@@ -222,6 +303,75 @@ class Attention(nn.Module):
             q, k, v = qkv, None, None
         return q, k, v

+    def create_output(self, q: torch.Tensor):
+        num_tokens = q.shape[0]
+        hidden_size = self.o_proj.in_features
+        out_dtype = q.dtype
+
+        if self.attn_backend == "TRTLLM":
+            has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
+                               or self.o_proj.has_fp8_block_scales
+                               or self.o_proj.has_fp8_rowwise)
+            if has_quant_scale and self.attn.has_fp8_kv_cache:
+                out_dtype = torch.float8_e4m3fn
+        output = q.new_empty([num_tokens, hidden_size], dtype=out_dtype)
+        return output
+
+    def _attn_impl(
+        self,
+        q: torch.Tensor,
+        k: Optional[torch.Tensor],
+        v: Optional[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        attention_mask: AttentionMask,
+        mrope_rotary_cos_sin: Optional[torch.Tensor],
+        mrope_position_deltas: Optional[torch.Tensor],
+        attention_window_size: Optional[int],
+        attention_mask_data: Optional[torch.Tensor],
+        enable_attn_nvfp4_output: bool = True,
+        output: Optional[torch.Tensor] = None,
+        output_sf: Optional[torch.Tensor] = None,
+    ):
+
+        out_scale = None
+        out_scale_sf = None
+        has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
+                           or self.o_proj.has_fp8_block_scales
+                           or self.o_proj.has_fp8_rowwise)
+        if has_quant_scale:
+            out_scale = self.o_proj.inv_input_scale
+        if self.o_proj.has_nvfp4 and self.support_nvfp4_output and enable_attn_nvfp4_output:
+            out_scale_sf = self.o_proj.input_scale
+
+        mrope_config = None
+        if mrope_rotary_cos_sin is not None or mrope_position_deltas is not None:
+            mrope_config = dict()
+            if mrope_rotary_cos_sin is not None:
+                mrope_config["mrope_rotary_cos_sin"] = mrope_rotary_cos_sin
+            if mrope_position_deltas is not None:
+                mrope_config["mrope_position_deltas"] = mrope_position_deltas
+
+        attn_output = self.attn.forward(
+            q,
+            k,
+            v,
+            attn_metadata,
+            out_scale=out_scale,
+            out_scale_sf=out_scale_sf,
+            attention_mask=attention_mask,
+            mrope_config=mrope_config,
+            attention_window_size=attention_window_size,
+            attention_mask_data=attention_mask_data,
+            enable_attn_nvfp4_output=enable_attn_nvfp4_output,
+            output=output,
+            output_sf=output_sf)
+        if isinstance(attn_output, tuple):
+            assert len(
+                attn_output
+            ) == 2, "attn_output should be a tuple of (output, output_sf)"
+            return attn_output[0], attn_output[1]
+        return attn_output, None
+
     def forward(
         self,
         position_ids: Optional[torch.IntTensor],
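create_output above preallocates the attention output in the dtype the TRTLLM backend will actually write: when o_proj consumes a quantized input and the KV cache is FP8, the kernel emits float8_e4m3fn directly. A toy illustration of the same new_empty pattern; the boolean flags are stand-ins for the quantization properties queried in the real code:

import torch


def create_output(q: torch.Tensor, hidden_size: int, has_quant_scale: bool,
                  has_fp8_kv_cache: bool) -> torch.Tensor:
    out_dtype = q.dtype
    if has_quant_scale and has_fp8_kv_cache:
        out_dtype = torch.float8_e4m3fn  # kernel writes FP8 directly
    # new_empty inherits q's device and layout; only shape/dtype differ.
    return q.new_empty([q.shape[0], hidden_size], dtype=out_dtype)


q = torch.randn(8, 256)
print(create_output(q, 1024, True, True).dtype)  # torch.float8_e4m3fn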
@@ -264,31 +414,56 @@ class Attention(nn.Module):
         if qkv_lora is not None:
             qkv = qkv + qkv_lora

+        mrope_rotary_cos_sin = None
+        mrope_position_deltas = None
+        if mrope_config is not None:
+            if "mrope_rotary_cos_sin" in mrope_config:
+                mrope_rotary_cos_sin = mrope_config["mrope_rotary_cos_sin"]
+            if "mrope_position_deltas" in mrope_config:
+                mrope_position_deltas = mrope_config["mrope_position_deltas"]
+
+        output = None
+
         q, k, v = qkv, None, None
+
         q, k, v = self.apply_rope(q, k, v, position_ids)

-        out_scale = None
-        out_scale_sf = None
-        if self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales or self.o_proj.has_fp8_rowwise:
-            out_scale = self.o_proj.inv_input_scale
-            if self.o_proj.has_nvfp4 and self.support_nvfp4_output:
-                out_scale_sf = self.o_proj.input_scale
-
         q, k, v = self.convert_qkv(q, k, v)
-        attn_output = self.attn.forward(
-            q,
-            k,
-            v,
-            attn_metadata,
-            out_scale=out_scale,
-            out_scale_sf=out_scale_sf,
-            attention_mask=attention_mask,
-            mrope_config=mrope_config,
-            attention_window_size=attention_window_size,
-            attention_mask_data=attention_mask_data)
-        hidden_states = attn_output
-        attn_output = self.o_proj(attn_output,
+
+        # Currently only TRTLLM and FLASHINFER are torch compile compatible backends.
+        # Only enable custom inplace op when torch compiling.
+        use_custom_inplace_op = (self.register_to_config
+                                 and (self.attn_backend == "TRTLLM"
+                                      or self.attn_backend == "FLASHINFER")
+                                 and is_torch_compiling())
+        if use_custom_inplace_op:
+            output = self.create_output(q)
+            attn_custom_op_inplace(
+                q,
+                k,
+                v,
+                attention_mask,
+                mrope_rotary_cos_sin,
+                mrope_position_deltas,
+                attention_window_size,
+                attention_mask_data,
+                self.layer_idx_str,
+                output=output,
+            )
+        else:
+            output, output_sf = self._attn_impl(
+                q,
+                k,
+                v,
+                attn_metadata,
+                attention_mask,
+                mrope_rotary_cos_sin,
+                mrope_position_deltas,
+                attention_window_size,
+                attention_mask_data,
+            )
+            if output_sf is not None:
+                output = Fp4QuantizedTensor(output, output_sf)
+
+        attn_output = self.o_proj(output,
                                   all_reduce_params=all_reduce_params,
                                   lora_params=lora_params,
                                   layer_idx=self.layer_idx)
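The dispatch above only takes the custom-op path while torch.compile is tracing; in eager mode the plain _attn_impl call is cheaper and keeps the NVFP4 output path available. is_torch_compiling is imported from ..utils and, judging by its use here, plays the role of the public torch.compiler.is_compiling(). A sketch of the same guard with the public API; the demo op is the hypothetical one from the earlier sketch:

import torch


def forward(x: torch.Tensor) -> torch.Tensor:
    if torch.compiler.is_compiling():
        # Compiled path: route through the opaque custom op so the backend
        # call stays a single graph node.
        out = torch.empty_like(x)
        torch.ops.demo.scale_inplace(x, 2.0, out)
        return out
    # Eager path: call the implementation directly.
    return x * 2.0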
@@ -316,30 +491,6 @@ class Attention(nn.Module):
         return q, k, v


-def extract_extra_attrs(layer_idx: str):
-    extra_attrs = get_model_extra_attrs()
-    assert extra_attrs is not None, "Model extra attrs is not set"
-
-    metadata_ref = extra_attrs.get("attention_metadata", None)
-    assert metadata_ref is not None, "Attention metadata is not set"
-    metadata = metadata_ref()
-    assert isinstance(
-        metadata,
-        TrtllmAttentionMetadata,
-    )
-
-    mla_layers = extra_attrs.get("mla_layers", None)
-    assert mla_layers is not None, "MLA layers is not registered"
-    mla_layer_ref = mla_layers.get(layer_idx, None)
-    assert mla_layer_ref is not None, f"Cannot find MLA layer for layer {layer_idx}"
-    mla_layer = mla_layer_ref()
-    assert isinstance(
-        mla_layer,
-        MLA), "MLA layer must be a subclass of MLA or an instance of MLA"
-
-    return metadata, mla_layer
-
-
 @torch.library.custom_op("trtllm::mla_custom_op_inplace",
                          mutates_args=("output", ))
 def mla_custom_op_inplace(
@@ -348,7 +499,7 @@ def mla_custom_op_inplace(
     layer_idx: str,
     output: torch.Tensor,
 ) -> None:
-    metadata, mla_layer = extract_extra_attrs(layer_idx)
+    metadata, mla_layer = extract_extra_attrs(layer_idx, "mla")
     mla_layer.forward_impl(position_ids, hidden_states, metadata, output=output)
@@ -90,11 +90,15 @@ class QWenDecoderLayer(Module):
         if config.moe.has_moe():
             mlp_kwargs = {'moe_config': config.moe, 'mapping': config.mapping}
             if config.qwen_type == 'qwen2_moe':
+                # Qwen2 MoE uses SharedMoE with shared expert
                 ClsMLP = SharedMoE
                 mlp_kwargs['use_shared_gate'] = True
                 mlp_kwargs['use_side_stream'] = True
                 mlp_kwargs['moe_config'].shared_expert_intermediate_size = \
                     config.moe_shared_expert_intermediate_size
+            elif config.qwen_type == 'qwen3_moe':
+                # Qwen3 MoE uses standard MOE without shared expert
+                ClsMLP = MOE
             else:
                 ClsMLP = MOE
         else:
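The new branch encodes a structural difference between the two MoE families: qwen2_moe wraps its experts in SharedMoE with a shared expert and gate, while qwen3_moe is a plain expert mixture. An equivalent table form of the dispatch, with class names stored as strings so the sketch runs standalone:

# Equivalent rephrasing of the qwen_type -> MLP class dispatch above.
MOE_CLS_BY_QWEN_TYPE = {
    'qwen2_moe': 'SharedMoE',  # shared expert + shared gate, side-stream execution
    'qwen3_moe': 'MOE',        # standard MoE, no shared expert
}


def select_mlp_cls(qwen_type: str) -> str:
    return MOE_CLS_BY_QWEN_TYPE.get(qwen_type, 'MOE')


assert select_mlp_cls('qwen3_moe') == 'MOE'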
@@ -104,7 +108,7 @@ class QWenDecoderLayer(Module):
         # Qwen's real inter_size depends on qwen_type
         if self.config.qwen_type == 'qwen':
             intermediate_size = config.intermediate_size // 2
-        elif self.config.qwen_type == 'qwen2_moe':
+        elif self.config.qwen_type in ('qwen2_moe', 'qwen3_moe'):
             intermediate_size = config.moe_intermediate_size
         else:
             intermediate_size = config.intermediate_size
@@ -264,18 +268,11 @@ class QWenForCausalLM(DecoderModelForCausalLM):
                 "mlp_4h_to_h": "mlp.c_proj",
                 "mlp_gate": "w1",
             }
-        elif config.qwen_type == 'qwen2_moe':
+        elif config.qwen_type in ('qwen2_moe', 'qwen3_moe'):
             self.trtllm_modules_to_hf_modules = copy.copy(
                 get_default_trtllm_modules_to_hf_modules())
+            # Common MoE expert mappings for both Qwen2 and Qwen3 MoE
             self.trtllm_modules_to_hf_modules.update({
-                "mlp_h_to_4h":
-                "mlp.shared_expert.gate_proj",
-                "mlp_4h_to_h":
-                "mlp.shared_expert.down_proj",
-                "mlp_gate":
-                "mlp.shared_expert.up_proj",
-                "mlp_router":
-                "mlp.shared_expert_gate",
                 "moe_h_to_4h":
                 "mlp.experts.gate_proj",
                 "moe_4h_to_h":
|
||||
"moe_gate":
|
||||
"mlp.experts.up_proj",
|
||||
})
|
||||
# Qwen2 MoE additionally has shared expert
|
||||
if config.qwen_type == 'qwen2_moe':
|
||||
self.trtllm_modules_to_hf_modules.update({
|
||||
"mlp_h_to_4h":
|
||||
"mlp.shared_expert.gate_proj",
|
||||
"mlp_4h_to_h":
|
||||
"mlp.shared_expert.down_proj",
|
||||
"mlp_gate":
|
||||
"mlp.shared_expert.up_proj",
|
||||
"mlp_router":
|
||||
"mlp.shared_expert_gate",
|
||||
})
|
||||
else:
|
||||
self.trtllm_modules_to_hf_modules = None
|
||||
super().__init__(config, transformer, lm_head)
|
||||
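Net effect of the two hunks above: qwen2_moe and qwen3_moe now share the expert mappings, and only qwen2_moe layers the shared-expert entries on top. Written out as plain dicts (keys and values copied from the diff; one value falls between the two hunks and is left out):

# Shared by qwen2_moe and qwen3_moe:
common_moe_mappings = {
    'moe_h_to_4h': 'mlp.experts.gate_proj',
    # 'moe_4h_to_h': ...  (its target sits between the two hunks above)
    'moe_gate': 'mlp.experts.up_proj',
}

# Added on top for qwen2_moe only:
qwen2_shared_expert_mappings = {
    'mlp_h_to_4h': 'mlp.shared_expert.gate_proj',
    'mlp_4h_to_h': 'mlp.shared_expert.down_proj',
    'mlp_gate': 'mlp.shared_expert.up_proj',
    'mlp_router': 'mlp.shared_expert_gate',
}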
@@ -343,6 +352,12 @@ class QWenForCausalLM(DecoderModelForCausalLM):
                 "mlp.shared_expert_gate": "mlp.shared_expert_gate",
                 "fc": ["up_proj", "gate_proj"],
             }
+        elif config.qwen_type == "qwen3_moe":
+            custom_dict = {
+                "fc": ["up_proj", "gate_proj"],
+                "q_layernorm": "q_norm",
+                "k_layernorm": "k_norm",
+            }
         elif config.qwen_type in {"qwen2", "qwen2_vl"
                                   } and config.tie_word_embeddings:
             custom_dict = {"lm_head": "model.embed_tokens"}
@@ -360,7 +375,7 @@ class QWenForCausalLM(DecoderModelForCausalLM):
                 "transformer": "language_model.model",
                 "lm_head": "language_model.lm_head",
             }
-        elif config.qwen_type in ("qwen3", "qwen3_moe"):
+        elif config.qwen_type == "qwen3":
             custom_dict = {
                 "q_layernorm": "q_norm",
                 "k_layernorm": "k_norm",
|
||||
loader.load(tllm_key,
|
||||
custom_postprocess_kwargs=arg_dict))
|
||||
loader.fill(tllm_weights)
|
||||
elif config.qwen_type == "qwen2_moe":
|
||||
elif config.qwen_type in ("qwen2_moe", "qwen3_moe"):
|
||||
for tllm_key, _ in model.named_parameters():
|
||||
sub_module = model
|
||||
for attr in tllm_key.split(".")[:-1]:
|
||||
|
||||
@@ -316,6 +316,10 @@ class PyTorchModelEngine(ModelEngine):
         self._init_model_capacity()

         self._torch_compile_backend = None
+        self._torch_compile_enabled = pytorch_backend_config.torch_compile_enabled
+        self._torch_compile_piecewise_cuda_graph = (
+            pytorch_backend_config.torch_compile_piecewise_cuda_graph
+            and not self.enable_attention_dp)

         try:
             if pytorch_backend_config.torch_compile_enabled:
@@ -325,8 +329,8 @@ class PyTorchModelEngine(ModelEngine):
                 self._torch_compile_backend = Backend(
                     pytorch_backend_config.torch_compile_inductor_enabled,
                     enable_userbuffers=use_ub,
-                    enable_piecewise_cuda_graph=pytorch_backend_config.
-                    torch_compile_piecewise_cuda_graph,
+                    enable_piecewise_cuda_graph=self.
+                    _torch_compile_piecewise_cuda_graph,
                     cuda_graph_batch_sizes=pytorch_backend_config.
                     cuda_graph_batch_sizes,
                     max_num_streams=pytorch_backend_config.
@@ -350,8 +354,6 @@ class PyTorchModelEngine(ModelEngine):
                 import traceback
                 traceback.print_exception(Exception, e, e.__traceback__)
                 raise e
-        self._torch_compile_enabled = pytorch_backend_config.torch_compile_enabled
-        self._torch_compile_piecewise_cuda_graph = pytorch_backend_config.torch_compile_piecewise_cuda_graph

         self.attn_backend = get_attention_backend(attn_backend)
@@ -658,7 +660,6 @@ class PyTorchModelEngine(ModelEngine):
                 self._torch_compile_backend)

             self._torch_compile_backend.enable_optimization()
-            set_enable_piecewise_cuda_graph_capture_flag(True)

             # Disable cuda graph capture here so that we can properly capture it later
             with self.no_cuda_graph():
@@ -746,26 +747,28 @@ class PyTorchModelEngine(ModelEngine):
                             resource_manager=resource_manager)
                     torch.cuda.synchronize()

-        if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled:
-            with self.no_cuda_graph():
-                with release_batch(
-                        get_torch_compile_warmup_request(1, bs)) as batch:
-                    logger.info(
-                        f"Run piecewise CUDA graph warmup for batch size={bs}"
-                    )
-
-                    for _ in range(3):
-                        self.forward(
-                            batch,
-                            new_tensors_device=None,
-                            resource_manager=resource_manager)
-                    self.forward(batch,
-                                 new_tensors_device=None,
-                                 resource_manager=resource_manager)
-                    torch.cuda.synchronize()
-                    gc.collect()
-                    torch.cuda.empty_cache()
+        if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled:
+            for seq_lens in cuda_graph_batch_sizes:
+                set_enable_piecewise_cuda_graph_capture_flag(True)
+                with self.no_cuda_graph():
+                    with release_batch(
+                            get_torch_compile_warmup_request(
+                                1, seq_lens)) as batch:
+                        logger.info(
+                            f"Run piecewise CUDA graph warmup for seq_lens={seq_lens}"
+                        )
+                        # self.model.mtp_worker.stored_input_ids = []
+                        for _ in range(3):
+                            self.forward(batch,
+                                         new_tensors_device=None,
+                                         resource_manager=resource_manager)
+                        self.forward(batch,
+                                     new_tensors_device=None,
+                                     resource_manager=resource_manager)
+                        torch.cuda.synchronize()
+                        gc.collect()
+                        torch.cuda.empty_cache()
+                set_enable_piecewise_cuda_graph_capture_flag(False)

         # Set the value back to the original value
         self.enable_spec_decode = self.is_spec_decode
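The reworked warmup runs once per CUDA-graph size instead of once overall, toggling piecewise capture around each size and releasing memory in between. A generic sketch of that shape; the engine methods here are hypothetical stand-ins for the calls in the diff:

import gc

import torch


def warmup_piecewise(engine, cuda_graph_batch_sizes):
    for seq_lens in cuda_graph_batch_sizes:
        engine.set_capture_enabled(True)   # stand-in for the capture flag
        batch = engine.make_warmup_batch(seq_lens)  # hypothetical helper
        for _ in range(3):                 # settle autotuners and allocators
            engine.forward(batch)
        engine.forward(batch)              # capture-eligible run
        torch.cuda.synchronize()           # ensure the run has finished
        gc.collect()                       # drop warmup-only allocations
        torch.cuda.empty_cache()           # return freed blocks to the driver
        engine.set_capture_enabled(False)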
@@ -993,8 +996,7 @@ class PyTorchModelEngine(ModelEngine):
                               moe_max_num_tokens: Optional[int] = None,
                               moe_load_balancer: Optional[MoeLoadBalancerConfig] = None,
                               lora_config: Optional[LoraConfig] = None,
-                              **kwargs):
-
+                              **kwargs) -> DecoderModelForCausalLM:
         config = checkpoint_loader.load_config(
             checkpoint_dir,
             trust_remote_code=True,
@@ -187,6 +187,23 @@ class MoeConfig(StrictBaseModel):
         return cls(**data)


+class AttentionDpConfig(StrictBaseModel):
+    """
+    Configuration for attention DP.
+    """
+    enable_balance: bool = Field(default=False,
+                                 description="Whether to enable balance.")
+    timeout_iters: int = Field(
+        default=50, description="The number of iterations to timeout.")
+    batching_wait_iters: int = Field(
+        default=10,
+        description="The number of iterations to wait for batching.")
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(**data)
+
+
 @dataclass
 class _ParallelConfig:
     ''' The model distribution configs for LLM. '''
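AttentionDpConfig is a pydantic model, so its fields are validated and defaulted at construction time, and from_dict is a thin wrapper over the keyword constructor. A quick usage sketch, assuming the class behaves like a strict pydantic BaseModel:

cfg = AttentionDpConfig.from_dict({'enable_balance': True, 'timeout_iters': 100})
assert cfg.enable_balance is True
assert cfg.timeout_iters == 100
assert cfg.batching_wait_iters == 10  # field default

# A strict base model rejects unknown keys, so typos surface immediately:
# AttentionDpConfig.from_dict({'enable_blance': True})  # -> ValidationError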
@@ -1988,6 +2005,11 @@ class TorchLlmArgs(BaseLlmArgs):
         Note that each CUDA graph can use up to 200 MB of extra memory.",
         status="beta")

+    attention_dp_config: Optional[AttentionDpConfig] = Field(
+        default=None,
+        description="Optimized load-balancing for the DP Attention scheduler.",
+        status="beta")
+
     disable_overlap_scheduler: bool = Field(
         default=False,
         description="Disable the overlap scheduler.",
@@ -2253,6 +2275,29 @@ class TorchLlmArgs(BaseLlmArgs):

         return self

+    @model_validator(mode='after')
+    def validate_attention_dp_config(self) -> 'TorchLlmArgs':
+        """Validate attention DP configuration.
+
+        Ensures that:
+        1. If attention_dp_config.enable_balance is true, attention_dp_config.batching_wait_iters must be greater than or equal to 0.
+        2. If attention_dp_config.enable_balance is true, attention_dp_config.timeout_iters must be greater than or equal to 0.
+        """
+        if self.attention_dp_config is None:
+            return self
+
+        config = self.attention_dp_config
+        if config.enable_balance:
+            if config.batching_wait_iters < 0:
+                raise ValueError(
+                    "attention_dp_config.batching_wait_iters must be greater than or equal to 0 when enable_balance is true"
+                )
+            if config.timeout_iters < 0:
+                raise ValueError(
+                    "attention_dp_config.timeout_iters must be greater than or equal to 0 when enable_balance is true"
+                )
+        return self
+
     # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
     def get_pytorch_backend_config(self) -> "PyTorchConfig":
         from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
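The validator follows the standard pydantic v2 pattern: model_validator(mode='after') runs once all fields are populated and raises to reject an inconsistent combination. A self-contained toy version of the same check:

from typing import Optional

from pydantic import BaseModel, model_validator


class ToyDpConfig(BaseModel):
    enable_balance: bool = False
    timeout_iters: int = 50


class ToyArgs(BaseModel):
    attention_dp_config: Optional[ToyDpConfig] = None

    @model_validator(mode='after')
    def validate_attention_dp_config(self) -> 'ToyArgs':
        cfg = self.attention_dp_config
        if cfg is not None and cfg.enable_balance and cfg.timeout_iters < 0:
            raise ValueError(
                "timeout_iters must be greater than or equal to 0 when enable_balance is true"
            )
        return self


# Raises a ValidationError wrapping the ValueError above:
# ToyArgs(attention_dp_config=ToyDpConfig(enable_balance=True, timeout_iters=-1))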
@@ -2303,7 +2348,16 @@ class TorchLlmArgs(BaseLlmArgs):
             enable_min_latency=self.enable_min_latency,
             stream_interval=self.stream_interval,
             force_dynamic_quantization=self.force_dynamic_quantization,
-            allreduce_strategy=self.allreduce_strategy)
+            allreduce_strategy=self.allreduce_strategy,
+            attention_dp_enable_balance=bool(
+                self.attention_dp_config is not None
+                and self.attention_dp_config.enable_balance),
+            attention_dp_time_out_iters=self.attention_dp_config.timeout_iters
+            if self.attention_dp_config is not None else
+            AttentionDpConfig.model_fields['timeout_iters'].default,
+            attention_dp_batching_wait_iters=self.attention_dp_config.
+            batching_wait_iters if self.attention_dp_config is not None else
+            AttentionDpConfig.model_fields['batching_wait_iters'].default)


 def update_llm_args_with_extra_dict(
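The fallbacks above read the declared field defaults through model_fields rather than hard-coding 50 and 10, so the PyTorchConfig stays in sync if AttentionDpConfig's defaults ever change. In pydantic v2 that lookup works like this (toy model):

from pydantic import BaseModel, Field


class Cfg(BaseModel):
    timeout_iters: int = Field(default=50)


# model_fields maps field name -> FieldInfo; .default is the declared default.
assert Cfg.model_fields['timeout_iters'].default == 50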
@@ -2320,6 +2374,7 @@ def update_llm_args_with_extra_dict(
         "speculative_config": DecodingBaseConfig,
         "lora_config": LoraConfig,
         "moe_config": MoeConfig,
+        "attention_dp_config": AttentionDpConfig,
     }
     for field_name, field_type in field_mapping.items():
         if field_name in llm_args_dict:
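Registering attention_dp_config in field_mapping lets a raw dict, for example parsed from an extra-options YAML file, be promoted into the typed config before the LLM args are rebuilt. A simplified sketch of what that loop does for this field; the real helper handles more cases:

llm_args_dict = {
    'attention_dp_config': {'enable_balance': True, 'timeout_iters': 100},
}

field_mapping = {'attention_dp_config': AttentionDpConfig}
for field_name, field_type in field_mapping.items():
    if field_name in llm_args_dict:
        # Promote the plain dict into the typed pydantic config.
        llm_args_dict[field_name] = field_type(**llm_args_dict[field_name])

assert isinstance(llm_args_dict['attention_dp_config'], AttentionDpConfig)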
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />

 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -678,9 +682,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -2028,9 +2032,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -658,9 +662,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1302,9 +1306,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -781,9 +785,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -8754,9 +8758,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -637,9 +641,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -3502,9 +3506,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -644,9 +648,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -893,9 +897,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1360,9 +1364,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1208,9 +1212,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1234,9 +1238,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -530,6 +534,7 @@
                                create_input_processor_with_hash, prompt_inputs)
from ..logger import logger
from ..sampling_params import SamplingParams
from ..scheduling_params import SchedulingParams
from .llm_args import (TORCH_LLMARGS_EXPLICIT_DOCSTRING,
                       TRT_LLMARGS_EXPLICIT_DOCSTRING, NGramDecodingConfig,
                       PeftCacheConfig, PybindMirror, TorchLlmArgs, TrtLlmArgs)
@ -742,6 +747,8 @@
                KvCacheRetentionConfig, Sequence[KvCacheRetentionConfig]]] = None,
            disaggregated_params: Optional[Union[
                DisaggregatedParams, Sequence[DisaggregatedParams]]] = None,
            scheduling_params: Optional[Union[SchedulingParams,
                                              List[SchedulingParams]]] = None,
    ) -> Union[RequestOutput, List[RequestOutput]]:
        """Generate output for the given prompts in the synchronous mode.
        Synchronous generation accepts either single prompt or batched prompts.
@ -760,6 +767,8 @@
                Configuration for the request's retention in the KV Cache. Defaults to None.
            disaggregated_params (tensorrt_llm.disaggregated_params.DisaggregatedParams, Sequence[tensorrt_llm.disaggregated_params.DisaggregatedParams], optional):
                Disaggregated parameters. Defaults to None.
            scheduling_params (tensorrt_llm.scheduling_params.SchedulingParams, List[tensorrt_llm.scheduling_params.SchedulingParams], optional):
                Scheduling parameters. Defaults to None.
        Returns:
            Union[tensorrt_llm.llmapi.RequestOutput, List[tensorrt_llm.llmapi.RequestOutput]]: The output data of the completion request to the LLM.
        """
@ -789,6 +798,7 @@
                kv_cache_retention_config=_item_at(kv_cache_retention_config,
                                                   i),
                disaggregated_params=_item_at(disaggregated_params, i),
                scheduling_params=_item_at(scheduling_params, i),
                streaming=False)
            futures.append(future)
@ -814,6 +824,7 @@
            kv_cache_retention_config: Optional[KvCacheRetentionConfig] = None,
            disaggregated_params: Optional[DisaggregatedParams] = None,
            _postproc_params: Optional[PostprocParams] = None,
            scheduling_params: Optional[SchedulingParams] = None,
    ) -> RequestOutput:
        """Generate output for the given prompt in the asynchronous mode.
        Asynchronous generation accepts single prompt only.
@ -827,6 +838,7 @@
            streaming (bool): Whether to use the streaming mode for the generation. Defaults to False.
            kv_cache_retention_config (tensorrt_llm.bindings.executor.KvCacheRetentionConfig, optional): Configuration for the request's retention in the KV Cache. Defaults to None.
            disaggregated_params (tensorrt_llm.disaggregated_params.DisaggregatedParams, optional): Disaggregated parameters. Defaults to None.
            scheduling_params (tensorrt_llm.scheduling_params.SchedulingParams, optional): Scheduling parameters. Defaults to None.

        Returns:
            tensorrt_llm.llmapi.RequestOutput: The output data of the completion request to the LLM.
@ -932,6 +944,7 @@
            disaggregated_params=disaggregated_params,
            postproc_params=_postproc_params,
            multimodal_params=multimodal_params,
            scheduling_params=scheduling_params,
        )

        return RequestOutput._from_generation_result(result, prompt,
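For context, a minimal sketch of threading the new scheduling_params argument through the synchronous API. The model id and the default-constructed SchedulingParams are illustrative assumptions; this diff does not show the fields SchedulingParams accepts.

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.scheduling_params import SchedulingParams

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # illustrative model
outputs = llm.generate(
    ["Hello, my name is"],
    sampling_params=SamplingParams(max_tokens=32),
    # Default-constructed here as an assumption; per the signature above,
    # a per-prompt List[SchedulingParams] is also accepted.
    scheduling_params=SchedulingParams(),
)
print(outputs[0].outputs[0].text)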
@ -1063,6 +1076,14 @@
            raise ValueError(
                f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
            )
        # Check prompt length and query length against max_num_tokens to filter illegal requests.
        # Skip check for gen-only requests
        if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill and not is_gen_only:
            max_num_tokens = self.args.max_num_tokens
            if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens:
                raise ValueError(
                    f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}), query length ({query_len}) should not exceed "
                    f"max_num_tokens ({max_num_tokens})")
        return

        build_config = self.args.build_config
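To make the arithmetic of the new check concrete, a worked instance with illustrative numbers:

# With context parallelism cp_size=2, a 6000-token prompt contributes
# 6000 / 2 = 3000 tokens per rank; adding a 200-token query gives 3200,
# which is within max_num_tokens=4096, so this request passes the check.
prompt_len, query_len, cp_size, max_num_tokens = 6000, 200, 2, 4096
assert prompt_len / cp_size + query_len <= max_num_tokens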
@ -1079,7 +1100,7 @@
                (sampling_params.max_tokens or 0) > max_seq_len):
            raise ValueError(
                f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}) and query length ({query_len}) max_tokens ({sampling_params.max_tokens}) should not exceed "
                f"max_seq_len ({build_config.max_seq_len})")
                f"max_seq_len ({max_seq_len})")

        if sampling_params.use_beam_search and sampling_params.best_of > build_config.max_beam_width:
            if sampling_params.n == sampling_params.best_of:
@ -699,6 +703,29 @@
class AttentionDpConfig(StrictBaseModel):
    """
    Configuration for attention DP.
    """
    enable_balance: bool = Field(default=False,
                                 description="Whether to enable balance.")
    timeout_iters: int = Field(
        default=50, description="The number of iterations to timeout.")
    batching_wait_iters: int = Field(
        default=10,
        description="The number of iterations to wait for batching.")

    @classmethod
    def from_dict(cls, data: dict):
        return cls(**data)


@dataclass
class _ParallelConfig:
    ''' The model distribution configs for LLM. '''
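For reference, a minimal sketch of constructing the new config; the override values are illustrative, while the field names and from_dict classmethod come from the definition above.

from tensorrt_llm.llmapi import AttentionDpConfig

# Direct construction, overriding the declared defaults shown above.
cfg = AttentionDpConfig(enable_balance=True, timeout_iters=100,
                        batching_wait_iters=20)

# Equivalent construction from a plain dict, e.g. parsed from a YAML config.
cfg = AttentionDpConfig.from_dict({
    "enable_balance": True,
    "timeout_iters": 100,
    "batching_wait_iters": 20,
})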
@ -2640,6 +2667,11 @@
        Note that each CUDA graph can use up to 200 MB of extra memory.",
        status="beta")

    attention_dp_config: Optional[AttentionDpConfig] = Field(
        default=None,
        description="Optimized load-balancing for the DP Attention scheduler.",
        status="beta")

    disable_overlap_scheduler: bool = Field(
        default=False,
        description="Disable the overlap scheduler.",
@ -2929,6 +2961,32 @@
        return self

    @model_validator(mode='after')
    def validate_attention_dp_config(self) -> 'TorchLlmArgs':
        """Validate attention DP configuration.

        Ensures that:
        1. If attention_dp_config.enable_balance is true, attention_dp_config.batching_wait_iters must be greater or equal to 0
        2. If attention_dp_config.enable_balance is true, attention_dp_config.timeout_iters must be greater or equal to 0
        """
        if self.attention_dp_config is None:
            return self

        config = self.attention_dp_config
        if config.enable_balance:
            if config.batching_wait_iters < 0:
                raise ValueError(
                    "attention_dp_config.batching_wait_iters must be greater or equal to 0 when enable_balance is true"
                )
            if config.timeout_iters < 0:
                raise ValueError(
                    "attention_dp_config.timeout_iters must be greater or equal to 0 when enable_balance is true"
                )
        return self

    # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
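A sketch of what this validator rejects, assuming the ValueError raised above surfaces through pydantic's validation (pydantic v2's ValidationError subclasses ValueError); TorchLlmArgs accepts many more fields, all left at their defaults here, and the model id is illustrative.

from tensorrt_llm.llmapi import AttentionDpConfig, TorchLlmArgs

try:
    TorchLlmArgs(
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # illustrative
        attention_dp_config=AttentionDpConfig(enable_balance=True,
                                              batching_wait_iters=-1),
    )
except ValueError as err:
    print(err)  # reports the batching_wait_iters constraint violation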
@ -2981,7 +3039,16 @@
            enable_min_latency=self.enable_min_latency,
            stream_interval=self.stream_interval,
            force_dynamic_quantization=self.force_dynamic_quantization,
            allreduce_strategy=self.allreduce_strategy)
            allreduce_strategy=self.allreduce_strategy,
            attention_dp_enable_balance=bool(
                self.attention_dp_config is not None
                and self.attention_dp_config.enable_balance),
            attention_dp_time_out_iters=self.attention_dp_config.timeout_iters
            if self.attention_dp_config is not None else
            AttentionDpConfig.model_fields['timeout_iters'].default,
            attention_dp_batching_wait_iters=self.attention_dp_config.
            batching_wait_iters if self.attention_dp_config is not None else
            AttentionDpConfig.model_fields['batching_wait_iters'].default)
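When attention_dp_config is None, the fallbacks above pull the declared defaults out of pydantic's model_fields metadata; concretely:

from tensorrt_llm.llmapi import AttentionDpConfig

# The fallback expressions resolve to the Field defaults declared earlier.
print(AttentionDpConfig.model_fields['timeout_iters'].default)        # 50
print(AttentionDpConfig.model_fields['batching_wait_iters'].default)  # 10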
@ -3000,6 +3067,7 @@
        "speculative_config": DecodingBaseConfig,
        "lora_config": LoraConfig,
        "moe_config": MoeConfig,
        "attention_dp_config": AttentionDpConfig,
    }
    for field_name, field_type in field_mapping.items():
        if field_name in llm_args_dict:
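The mapping drives dict-to-model coercion when arguments arrive as plain dicts; a rough sketch of the effect, with a hypothetical input dict and an assumed construction call (the loop shape mirrors the fragment above):

from tensorrt_llm.llmapi import AttentionDpConfig

llm_args_dict = {"attention_dp_config": {"enable_balance": True}}
field_mapping = {"attention_dp_config": AttentionDpConfig}
for field_name, field_type in field_mapping.items():
    if field_name in llm_args_dict:
        # Assumed coercion: build the typed config from the raw dict entry.
        llm_args_dict[field_name] = field_type(**llm_args_dict[field_name])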
@ -3175,9 +3243,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1139,9 +1143,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -871,9 +875,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1175,9 +1179,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -783,9 +787,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -800,9 +804,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -999,9 +1003,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -828,9 +832,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@@ -659,9 +663,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -912,9 +916,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -810,9 +814,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -674,9 +678,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -800,9 +804,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -894,9 +898,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -976,9 +980,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -1012,9 +1016,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -1948,9 +1952,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -2853,9 +2857,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -735,9 +739,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -897,9 +901,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -825,9 +829,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -1017,9 +1021,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -944,9 +948,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -1047,9 +1051,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -673,9 +677,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -825,9 +829,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -765,9 +769,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -899,9 +903,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -1247,9 +1251,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -1092,9 +1096,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -732,9 +736,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -882,9 +886,9 @@
 <div class="footer-item">
 <div class="extra_footer">

-<p>Last updated on August 01, 2025.</p>
+<p>Last updated on August 06, 2025.</p>

-<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

 </div></div>
@@ -58,7 +58,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -68,7 +68,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="1.0.0rc5" />
+<meta name="docsearch:version" content="1.0.0rc6" />


 </head>
@@ -384,7 +384,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@ -2193,9 +2197,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1259,9 +1263,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -2660,9 +2664,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -797,9 +801,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -731,9 +735,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -799,9 +803,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -802,9 +806,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -846,9 +850,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -942,9 +946,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1245,9 +1249,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -932,9 +936,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1473,9 +1477,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1041,9 +1045,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1888,9 +1892,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1156,9 +1160,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -5444,9 +5448,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1103,9 +1107,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1619,9 +1623,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -1829,9 +1833,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -3414,9 +3418,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@ -963,9 +967,9 @@
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on August 01, 2025.</p>
|
||||
<p>Last updated on August 06, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -68,7 +68,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -384,7 +384,11 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-bench.html">trtllm-bench</a></li>
|
||||
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
|
||||
</ul>
|
||||
</details></li>
|
||||
</ul>
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||||
<ul class="nav bd-sidenav">
|
||||
@@ -502,7 +506,7 @@
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, fields
-from typing import Dict, List, NamedTuple, Optional, Tuple, Union
+from typing import List, NamedTuple, Optional, Tuple, Union
 
 import torch
 from pydantic import BaseModel
@ -611,55 +615,6 @@
pass  # noqa


class LogitBiasLogitsProcessor(LogitsProcessor):

    def __init__(self, logit_bias: Dict[str, float]) -> None:
        super().__init__()
        self.logit_bias = logit_bias
        self.tokens_to_adjust = self.process_logit_bias(logit_bias)
        if not self.tokens_to_adjust:
            raise ValueError("Empty logit_bias provided - no tokens to adjust")

    def process_logit_bias(self, logit_bias: Dict[str, float]) -> Dict[int, float]:
        valid = {}
        invalid = {}

        for k, v in logit_bias.items():
            try:
                token_id = int(k)
                valid[token_id] = v
            except (ValueError, TypeError):
                invalid[k] = v

        if invalid:
            raise ValueError(
                f"Invalid token_ids in logit_bias: {list(invalid.keys())}. "
                f"All keys must be integers."
            )
        return valid

    def __call__(
        self,
        req_id: int,
        logits: torch.Tensor,
        token_ids: List[List[int]],
        stream_ptr: Optional[int],
        client_id: Optional[int],
    ) -> None:
        vocab_size = logits.size(-1)
        token_ids_list = list(self.tokens_to_adjust.keys())
        bias_values = torch.tensor(list(self.tokens_to_adjust.values()), device=logits.device)

        invalid_token_ids = [tid for tid in token_ids_list if tid >= vocab_size]
        if invalid_token_ids:
            raise ValueError(
                f"Token ID(s) {invalid_token_ids} exceed vocabulary size (vocab_size={vocab_size})"
            )

        stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr)
        with torch.cuda.stream(stream):
            logits[:, :, token_ids_list] += bias_values


@dataclass(slots=True, kw_only=True)
class AdditionalModelOutput:
    """An additional output to gather from the model.
@ -833,6 +788,12 @@

        self.best_of = self.best_of or self.n

        if self.embedding_bias is not None:
            if isinstance(self.embedding_bias, torch.Tensor):
                self.embedding_bias = self.embedding_bias.detach().clone()
            else:
                self.embedding_bias = torch.tensor(self.embedding_bias, dtype=torch.float32)

        self._validate()

    def _validate(self):
@ -1141,9 +1102,9 @@
<div class="footer-item">
<div class="extra_footer">

<p>Last updated on August 01, 2025.</p>
<p>Last updated on August 06, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

</div></div>
@ -4,24 +4,6 @@ Executor
.. Here are files in the cpp/include/executor
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
transferAgent.h
_______________

.. doxygenfile:: transferAgent.h
   :project: TensorRT-LLM

types.h
_______

.. doxygenfile:: types.h
   :project: TensorRT-LLM

cacheCommunicator.h
___________________

.. doxygenfile:: cacheCommunicator.h
   :project: TensorRT-LLM

disaggServerUtil.h
__________________

@ -34,6 +16,24 @@ ________
.. doxygenfile:: tensor.h
   :project: TensorRT-LLM

transferAgent.h
_______________

.. doxygenfile:: transferAgent.h
   :project: TensorRT-LLM

serialization.h
_______________

.. doxygenfile:: serialization.h
   :project: TensorRT-LLM

types.h
_______

.. doxygenfile:: types.h
   :project: TensorRT-LLM

executor.h
__________

@ -46,9 +46,9 @@ ______________________
.. doxygenfile:: dataTransceiverState.h
   :project: TensorRT-LLM

serialization.h
_______________
cacheCommunicator.h
___________________

.. doxygenfile:: serialization.h
.. doxygenfile:: cacheCommunicator.h
   :project: TensorRT-LLM

@ -4,40 +4,10 @@ Runtime
.. Here are files in the cpp/include/runtime
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
iTensor.h
_________
lookaheadBuffers.h
__________________

.. doxygenfile:: iTensor.h
   :project: TensorRT-LLM

cudaEvent.h
___________

.. doxygenfile:: cudaEvent.h
   :project: TensorRT-LLM

speculativeDecodingModule.h
___________________________

.. doxygenfile:: speculativeDecodingModule.h
   :project: TensorRT-LLM

common.h
________

.. doxygenfile:: common.h
   :project: TensorRT-LLM

samplingConfig.h
________________

.. doxygenfile:: samplingConfig.h
   :project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
.. doxygenfile:: lookaheadBuffers.h
   :project: TensorRT-LLM

lookaheadModule.h
@ -46,94 +16,28 @@ _________________
.. doxygenfile:: lookaheadModule.h
   :project: TensorRT-LLM

iBuffer.h
_________

.. doxygenfile:: iBuffer.h
   :project: TensorRT-LLM

modelConfig.h
_____________

.. doxygenfile:: modelConfig.h
   :project: TensorRT-LLM

request.h
_________

.. doxygenfile:: request.h
   :project: TensorRT-LLM

iGptDecoderBatched.h
____________________

.. doxygenfile:: iGptDecoderBatched.h
   :project: TensorRT-LLM

cudaStream.h
____________

.. doxygenfile:: cudaStream.h
   :project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
   :project: TensorRT-LLM

medusaModule.h
______________

.. doxygenfile:: medusaModule.h
   :project: TensorRT-LLM

decoderState.h
______________

.. doxygenfile:: decoderState.h
   :project: TensorRT-LLM

lookaheadBuffers.h
__________________

.. doxygenfile:: lookaheadBuffers.h
   :project: TensorRT-LLM

eagleModule.h
_____________

.. doxygenfile:: eagleModule.h
   :project: TensorRT-LLM

runtimeDefaults.h
_________________

.. doxygenfile:: runtimeDefaults.h
   :project: TensorRT-LLM

decodingOutput.h
________________

.. doxygenfile:: decodingOutput.h
   :project: TensorRT-LLM

decodingInput.h
_______________
promptTuningParams.h
____________________

.. doxygenfile:: decodingInput.h
   :project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
   :project: TensorRT-LLM

gptDecoderBatched.h
___________________

.. doxygenfile:: gptDecoderBatched.h
   :project: TensorRT-LLM

explicitDraftTokensBuffers.h
____________________________

.. doxygenfile:: explicitDraftTokensBuffers.h
.. doxygenfile:: promptTuningParams.h
   :project: TensorRT-LLM

bufferManager.h
@ -142,46 +46,22 @@ _______________
.. doxygenfile:: bufferManager.h
   :project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
   :project: TensorRT-LLM

eagleBuffers.h
______________

.. doxygenfile:: eagleBuffers.h
   :project: TensorRT-LLM

speculativeDecodingMode.h
_________________________

.. doxygenfile:: speculativeDecodingMode.h
   :project: TensorRT-LLM

promptTuningParams.h
____________________

.. doxygenfile:: promptTuningParams.h
   :project: TensorRT-LLM

gptDecoder.h
____________

.. doxygenfile:: gptDecoder.h
   :project: TensorRT-LLM

memoryCounters.h
________________

.. doxygenfile:: memoryCounters.h
   :project: TensorRT-LLM

ipcNvlsMemory.h
gptJsonConfig.h
_______________

.. doxygenfile:: ipcNvlsMemory.h
.. doxygenfile:: gptJsonConfig.h
   :project: TensorRT-LLM

runtimeDefaults.h
_________________

.. doxygenfile:: runtimeDefaults.h
   :project: TensorRT-LLM

loraCache.h
___________

.. doxygenfile:: loraCache.h
   :project: TensorRT-LLM

rawEngine.h
@ -190,22 +70,46 @@ ___________
.. doxygenfile:: rawEngine.h
   :project: TensorRT-LLM

ipcUtils.h
__________
gptDecoder.h
____________

.. doxygenfile:: ipcUtils.h
.. doxygenfile:: gptDecoder.h
   :project: TensorRT-LLM

iBuffer.h
_________
eagleBuffers.h
______________

.. doxygenfile:: iBuffer.h
.. doxygenfile:: eagleBuffers.h
   :project: TensorRT-LLM

gptJsonConfig.h
medusaModule.h
______________

.. doxygenfile:: medusaModule.h
   :project: TensorRT-LLM

virtualMemory.h
_______________

.. doxygenfile:: gptJsonConfig.h
.. doxygenfile:: virtualMemory.h
   :project: TensorRT-LLM

explicitDraftTokensBuffers.h
____________________________

.. doxygenfile:: explicitDraftTokensBuffers.h
   :project: TensorRT-LLM

iTensor.h
_________

.. doxygenfile:: iTensor.h
   :project: TensorRT-LLM

common.h
________

.. doxygenfile:: common.h
   :project: TensorRT-LLM

loraCachePageManagerConfig.h
@ -214,3 +118,105 @@ ____________________________
.. doxygenfile:: loraCachePageManagerConfig.h
   :project: TensorRT-LLM

worldConfig.h
_____________

.. doxygenfile:: worldConfig.h
   :project: TensorRT-LLM

loraModule.h
____________

.. doxygenfile:: loraModule.h
   :project: TensorRT-LLM

speculativeDecodingMode.h
_________________________

.. doxygenfile:: speculativeDecodingMode.h
   :project: TensorRT-LLM

cudaEvent.h
___________

.. doxygenfile:: cudaEvent.h
   :project: TensorRT-LLM

decodingInput.h
_______________

.. doxygenfile:: decodingInput.h
   :project: TensorRT-LLM

speculativeDecodingModule.h
___________________________

.. doxygenfile:: speculativeDecodingModule.h
   :project: TensorRT-LLM

iGptDecoderBatched.h
____________________

.. doxygenfile:: iGptDecoderBatched.h
   :project: TensorRT-LLM

eagleModule.h
_____________

.. doxygenfile:: eagleModule.h
   :project: TensorRT-LLM

tllmLogger.h
____________

.. doxygenfile:: tllmLogger.h
   :project: TensorRT-LLM

gptDecoderBatched.h
___________________

.. doxygenfile:: gptDecoderBatched.h
   :project: TensorRT-LLM

cudaStream.h
____________

.. doxygenfile:: cudaStream.h
   :project: TensorRT-LLM

ipcNvlsMemory.h
_______________

.. doxygenfile:: ipcNvlsMemory.h
   :project: TensorRT-LLM

samplingConfig.h
________________

.. doxygenfile:: samplingConfig.h
   :project: TensorRT-LLM

request.h
_________

.. doxygenfile:: request.h
   :project: TensorRT-LLM

decoderState.h
______________

.. doxygenfile:: decoderState.h
   :project: TensorRT-LLM

ipcUtils.h
__________

.. doxygenfile:: ipcUtils.h
   :project: TensorRT-LLM

memoryCounters.h
________________

.. doxygenfile:: memoryCounters.h
   :project: TensorRT-LLM

@ -503,7 +503,7 @@ Let's use some representative workloads to illustrate the performance impact wit
</div>
<p align="center"><sub><em>Figure 24: EP impact over MoE Group GEMM and EP communication</em></sub></p>
In Figure 24, it can be observed that by increasing the EP size from 4 to 72, the MoE Group GEMM computation time gets reduced, while the EP communication time (for EP4/EP8 Reduce/Scatter is used, while for EP>8 All2All is used) stays almost constant.
When the EP size increases from 18 to 32, the speed-up diminishes. We are working on optimizing it.
When the EP size increases from 18 to 72, the speed-up diminishes. We are working on optimizing it.

Next, let's use some representative workloads to understand the performance impact with EPLB.
<div align="center">
@ -515,7 +515,7 @@ Next, let's use some representative workloads to understand the performance impa
Clearly in Figure 25, we can see that EPLB brings a clear performance improvement when the EP size increases, for both MoE GroupGEMM and EP communication times.

## Reproducing steps
Currently to run through the reproducing steps described in this section, please, use this [feature branch](https://github.com/NVIDIA/TensorRT-LLM/tree/feat/large-ep/tensorrt_llm). It will get merged to the main branch soon.
The code and scripts required in the reproducing steps described in this section have been merged to the main branch.

### The effect of EP Load Balancer

@ -713,4 +713,5 @@ We believe the current implementation can be viewed as a reasonable E2E large-sc
## Acknowledgement

The large-scale EP work is another great team effort, spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT-LLM.

Through this collaborative endeavor, we have developed valuable insights that allow us to improve GPU utilization for large language model inference. We hope that the techniques and the experience shared in this blog will help the developer community to better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications.

@ -0,0 +1,322 @@
# Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)

This blog post continues our previous work on [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), where we introduced the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT-LLM. Building upon that foundation, we have made significant performance improvements through various optimizations, achieving better throughput and latency for large-scale MoE models.

*By NVIDIA TensorRT-LLM Team*

## Table of Contents
- [Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)](#scaling-expert-parallelism-in-tensorrt-llm-part-2-performance-status-and-optimization)
  - [Table of Contents](#table-of-contents)
  - [Optimization Highlights](#optimization-highlights)
    - [Kernel Optimizations](#kernel-optimizations)
      - [MoE Auxiliary Kernels](#moe-auxiliary-kernels)
      - [Communication Kernels](#communication-kernels)
    - [Expert Parallelism Load Balancer (EPLB)](#expert-parallelism-load-balancer-eplb)
      - [Attempts at Online EPLB Implementation](#attempts-at-online-eplb-implementation)
        - [1. Initial Approach for Weight Updating - cudaMemcpyAsync](#1-initial-approach-for-weight-updating---cudamemcpyasync)
        - [2. Avoiding Deadlock - Multithreaded CPU Copy with Managed Memory](#2-avoiding-deadlock---multithreaded-cpu-copy-with-managed-memory)
        - [3. NUMA Memory to Prevent Page Migration](#3-numa-memory-to-prevent-page-migration)
        - [4. Addressing the TLB Thrashing Issue](#4-addressing-the-tlb-thrashing-issue)
    - [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
    - [Host Overhead Optimization](#host-overhead-optimization)
      - [Reduce Binding and Inter-Process Communication Overhead](#reduce-binding-and-inter-process-communication-overhead)
      - [Support Stream Interval](#support-stream-interval)
  - [End-to-End Performance](#end-to-end-performance)
  - [Future Work](#future-work)
    - [Further Performance Optimization](#further-performance-optimization)
  - [Acknowledgements](#acknowledgements)

## Optimization Highlights

Following the introduction of the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT-LLM in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), the TensorRT-LLM team has focused on optimizing the large EP implementation to improve performance.

At the kernel level, we analyzed kernel duration and optimized performance by either improving existing kernels or developing new kernels that perform better. At the system level, we refined and optimized the EPLB implementation (which also helps reduce kernel scalability issues), integrated additional features such as MTP, and optimized host overhead to prevent Python code from slowing down inference.

### Kernel Optimizations

Our initial kernel breakdown and analysis revealed several key observations about performance impacts when Expert Parallelism (EP) scales up:

1. **MoE GEMM duration decreases** as EP size increases, which is expected behavior.
2. **Attention kernel performance** remains unaffected by increased EP size, demonstrating good scalability.
3. **Communication and some MoE kernels** do not scale well and require optimization.

<div align="center">
<figure>
  <img src="../media/tech_blog8_kernel_breakdown.png" width="1000">
</figure>
</div>
<p align="center"><sub><em>Figure 1: Kernel breakdown when scaling EP without EPLB.</em></sub></p>

We have made improvements to the MoE auxiliary kernels, including `expandInputRowsKernel`, `doActivationKernel`, and `finalizeMoeRoutingKernel`, and to the communication kernels by replacing `AllGather` with a newly developed `AllToAllPrepare` kernel. Additionally, since the `ReduceScatter` and `AlltoAll` kernels do not scale well due to EP imbalance, we optimized the EPLB implementation to improve the scalability of those kernels.

#### MoE Auxiliary Kernels

We observed that given a fixed per-GPU batch size, `expandInputRowsKernel`, `doActivationKernel`, and `finalizeMoeRoutingKernel` showed increased execution time with larger EP size. However, their workload should remain constant regardless of EP size.

Before MoE group GEMMs, `M` tokens are expanded to `M * topK` tokens, which are routed to experts hosted on different ranks. Hence, on average only `M * topK / EP` expanded tokens are valid on each rank (those routed to experts hosted on that rank). The original kernels launch a thread block for each expanded token. Each thread block detects if the token is valid; if so, it proceeds with the computation; otherwise, the thread block exits. For a large EP size, the valid tokens are sparse (`1 / EP`), so most thread blocks are launched for invalid tokens and do nothing, which is wasteful.

<div align="center">
<figure>
  <img src="../media/tech_blog8_moe_aux_kernels1.png" width="400">
</figure>
</div>
<p align="center"><sub><em>Figure 2: Sparsity of valid expanded tokens. For DeepSeek-R1 deployed with EP 32, a batch of 12 tokens are expanded to 96 tokens, but only 3 are valid on rank 0.</em></sub></p>

Therefore, we modified the kernels so that thread blocks are launched for valid tokens only. This addressed the scalability issue.

Note that the number of valid tokens is data-dependent. To guarantee CUDA graph compatibility, we cannot rely on any data-dependent information on the host. Thus, we further modified the kernels to use persistent thread blocks, which control the loop based on the valid token number on the device.
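
To make the pattern concrete, here is a minimal, hypothetical sketch of a persistent-block kernel in this style. It is not the actual TensorRT-LLM kernel: the names and the copy body are illustrative, but it shows the two key points, reading the data-dependent valid-token count on the device (so the fixed launch configuration stays CUDA-graph friendly) and striding over valid rows only.

```cuda
// Hypothetical sketch of the persistent-block pattern described above.
__global__ void process_valid_tokens(const int* __restrict__ valid_token_count,
                                     const int* __restrict__ valid_token_indices,
                                     const float* __restrict__ src,
                                     float* __restrict__ dst,
                                     int hidden_size) {
    // Read the data-dependent count on the device, never on the host,
    // so the same launch can be captured into a CUDA graph.
    const int num_valid = *valid_token_count;
    // Persistent blocks stride over valid tokens only, instead of launching
    // one block per (mostly invalid) expanded token.
    for (int token = blockIdx.x; token < num_valid; token += gridDim.x) {
        const int src_row = valid_token_indices[token];
        for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
            dst[token * hidden_size + i] = src[src_row * hidden_size + i];
        }
    }
}
```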

This optimization was implemented in [PR 5215](https://github.com/NVIDIA/TensorRT-LLM/pull/5215), with the following performance improvement:

<div align="center">
<figure>
  <img src="../media/tech_blog8_moe_aux_kernels2.png">
</figure>
</div>
<p align="center"><sub><em>Figure 3: Optimization effect on MoE auxiliary kernels. (Left) Before optimization, kernel time increases with EP size. (Right) After optimization, kernel time remains constant with EP size.</em></sub></p>

#### Communication Kernels

As introduced in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#ep-communication-kernels-implementation), we developed EP communication kernels to transfer hidden state tensors of MoE. In the original design, each rank needs to determine which tokens it needs to send and receive, along with the expert IDs and scaling factors selected by those tokens. We initially used `allgather` to collect expert IDs and scaling factors, then each rank calculated the required metadata. However, we found that although the transmission size of this data is not large, the performance of `allgather` is unsatisfactory and may become a performance bottleneck when EP size increases. Therefore, we developed new communication kernels to optimize this process.

First, a kernel counts the number of tokens that need to be transferred to each other rank and transfers the count to that rank. Then each rank can calculate the index information for subsequent alltoall kernels. Finally, an alltoall kernel transfers expert IDs and scaling factors. These kernels make EP more scalable because the communication size no longer increases with EP size. The communication part of these kernels is implemented similarly to the previous hidden-state communication kernels: buffers are used in a FIFO manner. An important difference is that these kernels use release-acquire instructions to ensure memory consistency, which has the advantage of supporting various forms of data more flexibly. Although this is not as efficient as the LL128 primitive in terms of performance, it enables fast iteration before the functionality converges.
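
As a rough illustration of the release-acquire scheme (a hedged sketch, not the actual TensorRT-LLM kernels), a sender can publish a payload into a peer-visible buffer and then release a system-scope flag that the receiver acquires before reading. libcu++ (`<cuda/atomic>`) is assumed, and all names are illustrative:

```cuda
#include <cuda/atomic>

using SysFlag = cuda::atomic<int, cuda::thread_scope_system>;

// Sender: write the payload, then release the flag so the payload writes
// become visible to any thread that later acquires the flag.
__global__ void send(int* payload, SysFlag* flag, const int* data, int n) {
    for (int i = threadIdx.x; i < n; i += blockDim.x) payload[i] = data[i];
    __syncthreads();
    if (threadIdx.x == 0) flag->store(1, cuda::std::memory_order_release);
}

// Receiver: acquire the flag before touching the payload, which guarantees
// the sender's writes are observed in full.
__global__ void recv(const int* payload, SysFlag* flag, int* out, int n) {
    if (threadIdx.x == 0) {
        while (flag->load(cuda::std::memory_order_acquire) == 0) { /* spin */ }
    }
    __syncthreads();
    for (int i = threadIdx.x; i < n; i += blockDim.x) out[i] = payload[i];
}
```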

Note that although these kernels achieve better performance compared to `allgather`, there is still considerable room for optimization, especially in latency-bound scenarios.

This optimization was implemented in [PR 5570](https://github.com/NVIDIA/TensorRT-LLM/pull/5570), with the following performance improvement:

<div align="center">
<figure>
  <img src="../media/tech_blog8_communication_kernel.png">
</figure>
</div>
<p align="center"><sub><em>Figure 4: Optimization effect on communication kernels.</em></sub></p>

### Expert Parallelism Load Balancer (EPLB)

As introduced in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#ep-load-balancer), EP-level workload imbalance is common for large-scale EP inference across multiple datasets and has significant performance impacts. TensorRT-LLM implements a set of functionalities to address this issue. We have refined the code and improved the usability of this feature, and the benefits of EPLB are directly reflected in kernel duration improvements.

The core challenge with EP scaling is that different experts receive varying amounts of work based on the routing decisions made by the MoE layer. This imbalance becomes more pronounced as EP size increases, leading to scenarios where some GPUs are heavily loaded while others remain underutilized. The Expert Parallelism Load Balancer (EPLB) addresses this by dynamically redistributing expert assignments to achieve better load balance across all participating GPUs.

EPLB operates in two main modes:
- **Static EPLB**: Pre-computed expert-to-GPU mappings based on historical data patterns
- **Online EPLB**: Dynamic runtime redistribution that adapts to real-time workload patterns

While Static EPLB provides good baseline improvements, Online EPLB offers the potential for optimal load balancing by responding to actual runtime patterns. However, implementing Online EPLB presented several unexpected technical challenges, particularly around weight synchronization and memory management in GPU clusters.

In the previous [Kernel Optimizations](#kernel-optimizations) section, we noted that `reduce_scatter` and `alltoall` kernels do not show good scalability, with load imbalance being the major root cause. After applying a proper EPLB strategy, those kernels perform well even when EP size scales to larger extents.

#### Attempts at Online EPLB Implementation

We discussed the [high-level design](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#high-level-design-introduction) and [implementation considerations](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#online-ep-load-balancer) of Online EPLB in our previous blog. However, several unexpected issues arose during implementation.

These issues primarily stem from the weight updating mechanism.

##### 1. Initial Approach for Weight Updating - cudaMemcpyAsync

Our initial approach for weight updating was straightforward. Since GPU kernels from the model forward thread read weights, we placed weights directly in GPU memory using `cudaMalloc` and used a separate non-blocking stream to invoke multiple `cudaMemcpyAsync` calls for weight updates. After implementing the first version of the prototype, we discovered that with CUDA Graph enabled, the model forward thread and the weight updating thread could deadlock.

After investigation, we found the root cause: both `cudaGraphLaunch` and `cudaMemcpyAsync` were competing for the same mutex inside CUDA. In our implementation with layer-wise weight updating, the GPU needs to synchronize with the CPU during model forward passes. This creates kernels that wait for CPU signals indicating that updates are complete and MoE weights are safe to use. These waiting kernels block subsequent kernels.

Since LLM models contain numerous kernels, `cudaGraphLaunch` may need to wait for previous kernels to finish to acquire sufficient resources for launch completion. When waiting kernels are blocked by the CPU, `cudaGraphLaunch` is also blocked. The CPU thread responsible for unblocking this process is the weight update thread, which should signal completion when weight updating finishes. However, since our initial implementation used `cudaMemcpyAsync` for weight updating, it needed to acquire the CUDA mutex before starting memcpy operations. Unfortunately, this mutex was held by `cudaGraphLaunch` in the model forward thread, which was waiting for the weight updating thread to complete. This created a deadlock scenario.

To resolve the deadlock, we needed to break the dependency cycle. While the model forward thread must depend on the weight updating thread for correctness, the weight updating process should not wait for `cudaGraphLaunch` in the model forward thread. Our solution was to use alternative methods instead of `cudaMemcpyAsync` to avoid competing for the same mutex with `cudaGraphLaunch` and other CUDA APIs.

##### 2. Avoiding Deadlock - Multithreaded CPU Copy with Managed Memory

Since weight updating is handled by CPU threads and we wanted to avoid interfering with GPU model forward passes while avoiding mutex contention in `cudaMemcpyAsync`, we chose to use CPU threads for copying operations. To achieve this, we needed MoE weights to be accessible by the CPU while remaining physically located on the GPU to provide high bandwidth for MoE forward passes.

On GB200 systems, the C2C link between CPU and GPU allows CPU access to GPU memory, with GPU memory treated as NUMA nodes. Although the CUDA Driver API doesn't directly support this in CUDA 12.9, one option is to use `cudaMallocManaged` for MoE weights and use `cudaMemAdvise` to set the GPU as the preferred location while enabling CPU access. The CPU copy implementation was straightforward, but we still needed to detect the system topology and bind to CPU cores belonging to the same NUMA node as the GPU's host NUMA node.
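
A minimal sketch of this placement, assuming the CUDA runtime API on a coherent C2C system (illustrative, not the exact TensorRT-LLM code):

```cuda
#include <cuda_runtime.h>

// Allocate weights in managed memory that stays resident on the GPU for
// bandwidth, while CPU threads can still read and write it directly.
void* alloc_cpu_accessible_weights(size_t bytes, int device) {
    void* ptr = nullptr;
    cudaMallocManaged(&ptr, bytes);
    // Prefer physical placement on the GPU...
    cudaMemAdvise(ptr, bytes, cudaMemAdviseSetPreferredLocation, device);
    // ...and let the CPU keep a direct mapping instead of faulting pages over.
    cudaMemAdvise(ptr, bytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
    return ptr;
}
```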

After completing this implementation, CUDA Graph worked well with weight updating and we began seeing end-to-end performance benefits using Online EPLB in some configurations. However, we soon encountered issues with managed memory. Although the preferred location of managed memory was set to GPU, and on GB200 it typically remains on GPU when accessed by CPU, we still observed page migration when GPU memory usage approached capacity limits. The bottom half of the UVM interrupt service process for each GPU consumed 100% of one CPU core's time, causing severe slowdowns when approaching GPU memory limits. To address this, we needed GPU memory that was accessible by CPU without triggering page migration.

##### 3. NUMA Memory to Prevent Page Migration

On GB200 systems, the Grace CPU and Blackwell GPU are connected via C2C links, enabling mutual memory access. GPU memories are also exposed to the OS as NUMA nodes. Running `numactl -H` on GB200 nodes shows output similar to this:

```text
# numactl -H
available: 34 nodes (0-33)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 0 size: 489935 MB
node 0 free: 370318 MB
node 1 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
node 1 size: 489795 MB
node 1 free: 465004 MB
node 2 cpus:
node 2 size: 188416 MB
node 2 free: 188415 MB
node 3 cpus:
node 3 size: 0 MB
node 3 free: 0 MB
...
node 9 cpus:
node 9 size: 0 MB
node 9 free: 0 MB
node 10 cpus:
node 10 size: 188416 MB
node 10 free: 188416 MB
...
node 18 cpus:
node 18 size: 188416 MB
node 18 free: 188416 MB
...
node 26 cpus:
node 26 size: 188416 MB
node 26 free: 188416 MB
...
node distances:
node 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
0: 10 40 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 120 120 120 120 120 120 120 120 120 120 120 120 120 120 120 120
1: 40 10 120 120 120 120 120 120 120 120 120 120 120 120 120 120 120 120 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80 80
2: 80 120 10 11 11 11 11 11 11 11 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
3: 80 120 11 10 11 11 11 11 11 11 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
...
9: 80 120 11 11 11 11 11 11 11 10 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
...
```

In this configuration, `node 0` and `node 1` are Grace CPU nodes, each with 72 CPU cores and 480GB of memory. `node 2`, `node 10`, `node 18`, and `node 26` represent NVIDIA GB200 GPUs, which have no CPU cores but contain memory. Additional NUMA nodes (3-9, 11-17, 19-25, 27-33) are reserved for MIG instances and show 0 MB memory size. For brevity, we only show `node 3` and `node 9` in the example.

It's possible to allocate system memory on a GPU's NUMA node using `numa_alloc_onnode` (e.g., NUMA node 2 for GPU 0), then register that memory with the GPU using `cudaHostRegister` to make it accessible as host system memory. This allows both CPU and GPU to access the memory, and our testing showed that bandwidth appears nearly identical to normal device memory from the GPU's perspective.
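
A hedged sketch of this allocation path, assuming libnuma and the CUDA runtime (illustrative names, error handling trimmed):

```cuda
#include <numa.h>
#include <cuda_runtime.h>

// Allocate memory physically resident on the GPU's NUMA node (e.g. node 2
// for GPU 0 on this GB200 topology) and expose it to the GPU.
void* alloc_gpu_numa(size_t bytes, int gpu_numa_node) {
    void* ptr = numa_alloc_onnode(bytes, gpu_numa_node);
    if (ptr == NULL) return NULL;
    // Map the allocation into the GPU's address space; kernels see
    // near-device bandwidth over C2C, and the CPU can still access it.
    if (cudaHostRegister(ptr, bytes, cudaHostRegisterMapped) != cudaSuccess) {
        numa_free(ptr, bytes);
        return NULL;
    }
    return ptr;
}
```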

This approach resolved page migration issues, and Online EPLB worked well for large batch sizes per GPU (e.g., 256). However, when investigating smaller batch sizes (32 or 64), we found that MoE GEMM kernel execution time could be higher than without Online EPLB, increasing from 75 µs to 93 µs for the first group GEMM of MoE with EP size 16. Further experiments revealed that when running group GEMM multiple times in the same layer, only the first execution suffered from this slowdown. By adding a warmup kernel that read only one value per 64 KB of weights, we found this simple warmup kernel consumed more than half the execution time of the group GEMM kernel. More interestingly, when running this warmup kernel in parallel with other kernels (using only 14 CTAs), those other kernels also became extremely slow. Based on these observations, we concluded that we were encountering TLB thrashing.

##### 4. Addressing the TLB Thrashing Issue

On GB200 systems, the default page size is 64 KB, which can be verified with:

```text
# getconf PAGE_SIZE
65536
```

The `numa_alloc_onnode` function may use this page size, which is too small for efficient GPU kernel execution. Linux systems support [HugeTLB Pages](https://docs.kernel.org/admin-guide/mm/hugetlbpage.html), and on GB200 systems, the huge page size is 512 MB:

```text
# cat /proc/meminfo
MemTotal: 1774995776 kB
MemFree: 1651165696 kB
MemAvailable: 1671517696 kB
...
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 524288 kB
Hugetlb: 0 kB
```

By using huge pages, we can significantly reduce the number of required TLB entries and avoid TLB thrashing. Our implementation approach (sketched below):

- Use `mmap` to allocate address space aligned to 512 MB boundaries
- Use `mbind` to bind the memory to the GPU's NUMA node (e.g., NUMA node 2 for GPU 0)
- Request huge pages using `madvise` with the `MADV_HUGEPAGE` flag
- Register the memory with the GPU using `cudaHostRegister`

This approach provides memory that is located on the GPU, accessible by the host, uses large pages instead of small ones, and doesn't trigger page migration. One consideration is that huge page allocation requires memory allocation at the granularity of one page (512 MB), which could cause significant memory waste with separate allocations. Since our primary use case involves MoE weights that are allocated at model load time and persist throughout the model's lifetime, we implemented a simple memory pool to minimize waste.
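
The steps above can be sketched roughly as follows. This is a simplified, hypothetical version: 512 MB huge pages and libnuma's `numaif.h` are assumed, and the 512 MB alignment fix-up (over-allocate and trim) and error handling are elided.

```cuda
#define _GNU_SOURCE
#include <sys/mman.h>
#include <numaif.h>
#include <cuda_runtime.h>

#define HUGE_PAGE_SIZE (512UL << 20)  // 512 MB on this GB200 configuration

void* alloc_huge_on_gpu_node(size_t bytes, int gpu_numa_node) {
    size_t len = (bytes + HUGE_PAGE_SIZE - 1) & ~(HUGE_PAGE_SIZE - 1);
    // Reserve an anonymous address range (boundary alignment elided here).
    void* ptr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    // Bind the range to the GPU's NUMA node, e.g. node 2 for GPU 0.
    unsigned long nodemask = 1UL << gpu_numa_node;
    mbind(ptr, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0);
    // Ask the kernel to back the range with transparent huge pages.
    madvise(ptr, len, MADV_HUGEPAGE);
    // Finally, map the memory into the GPU's address space.
    cudaHostRegister(ptr, len, cudaHostRegisterMapped);
    return ptr;
}
```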

Since our implementation relies on huge pages and `madvise`, Transparent Hugepages must be enabled on the system. Without this, you may encounter the exception `madvise(MADV_HUGEPAGE) failed.`. To verify that Transparent Hugepages is properly configured:

```bash
>$ cat /sys/kernel/mm/transparent_hugepage/enabled
always [madvise] never
>$ cat /sys/kernel/mm/transparent_hugepage/defrag
always defer defer+madvise [madvise] never
```

In the output above, the value in square brackets indicates the current setting. If `never` is highlighted instead of `madvise`, you can enable Transparent HugePages with:

```bash
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
```

After implementing huge pages, we found that warmup kernels now execute in only 4 µs without slowing down other kernels. Additionally, group GEMM kernel performance matches that achieved without Online EPLB, both with and without warmup operations. This optimization was implemented in [PR 5963](https://github.com/NVIDIA/TensorRT-LLM/pull/5963), and we achieved additional performance improvements using Online EPLB on the Pareto curve.
### Multi-Token Prediction (MTP)

MTP allows verifying and accepting several draft tokens in a single iteration, which is very beneficial for scenarios that prefer low latency. TensorRT-LLM supports MTP, and we refer to our previous [MTP blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md#mtp-implementation-in-tensorrt-llm) for more details on the implementation.

For large EP, we have also extended the implementation so that it works well with Online EPLB. This was implemented in [PR 5213](https://github.com/NVIDIA/TensorRT-LLM/pull/5213).
### Host Overhead Optimization

Since large-scale EP enables extensive parallelism that includes both expert parallelism and attention data parallelism, the total batch size of one iteration scales with the number of total GPUs involved in the calculation. One outcome is that this significantly increases the number of requests and responses that the system must handle, putting huge pressure on Python threads. The Global Interpreter Lock (GIL) makes the situation worse, since multi-threading won't help under heavy system workloads. When the workload prefers higher throughput, highly optimized CUDA kernels can even finish faster than the CPU can issue work, leaving the GPU idle while it waits for the CPU to catch up.

To address the increased host overhead when scaling parallelism in the system, we added optimizations to performance hot spots to reduce single-thread pressure.

#### Reduce Binding and Inter-Process Communication Overhead

TensorRT-LLM is designed to be composed of both C++ and Python code, so that C++ can handle the most performance-sensitive parts while Python handles higher-level logic. As we try to put more logic into Python to make the program easier to read and debug, there are still frequent round trips through the binding interfaces between C++ and Python. Besides, since most of the logic is implemented in Python, there are several layers of implementation that communicate with each other through inter-process communication. Frequent binding calls and the serialization/deserialization introduced by inter-process communication slow down the core library.

To improve program efficiency, we used environment variables introduced in the [performance analysis guidance](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-analysis.md) to measure and profile CPU overhead, and improved performance by reducing and reusing different binding calls as much as possible, and by delaying Python object deserialization to avoid duplicated serialization and reduce message size when doing inter-process communication. This optimization was added in [PR 5224](https://github.com/NVIDIA/TensorRT-LLM/pull/5224). We have also reduced Python garbage collection (GC) impacts in [PR 5141](https://github.com/NVIDIA/TensorRT-LLM/pull/5141).

To enable powerful NVTX markers for easier analysis of host overheads, TensorRT-LLM provides several useful environment variables:

```bash
export TLLM_NVTX_DEBUG=1                # enables more NVTX markers
export TLLM_PROFILE_RECORD_GC=1         # enables GC collection hint
export TLLM_PROFILE_START_STOP=100-150  # profiles only the specified iterations
```

#### Support Stream Interval

As mentioned previously, one outcome of large-scale workloads is that they significantly increase the number of requests and responses that the system must handle, putting huge pressure on Python threads. When the GPU finishes one iteration of calculation, a batch of responses is generated under streaming mode. For each response, TensorRT-LLM must perform detokenization so that output IDs are converted to strings, and OpenAI API protocol objects need to be initialized so that responses can be returned to the user. This becomes time-consuming, especially when the number of responses is huge and the CPU must process them on each iteration. From the user's perspective, this shows up as reduced streaming performance compared to non-streaming mode.

To address this problem, TensorRT-LLM supports a feature called stream interval. Instead of handling all responses on each iteration, a user-specified `stream_interval` `N` indicates that responses will be handled and returned every `N` iterations. This way, on each iteration, there will still be one output ID generated, but it won't be returned to users immediately (except for the first token, for the sake of time-to-first-token latency). Instead, tokens accumulate for `N` iterations, and one response is created to handle those `N` generated tokens, which greatly reduces pressure on the CPU side by giving the CPU more time to catch up. Meanwhile, users still get streamed output.

This feature was added in [PR 5284](https://github.com/NVIDIA/TensorRT-LLM/pull/5284), and we have verified that it works effectively to reduce host overhead. In most cases, setting `stream_interval` to 2 or 4 should close the gap (if any) between streaming and non-streaming modes. The feature can be enabled by setting the following in the YAML extra config file:

```yaml
stream_interval: 4
```

## End-to-End Performance

To demonstrate the benefits of large-scale EP, we compared performance on EP16 and EP32 with EP4 and EP8 as baselines, on GB200 NVL72 using DeepSeek R1 FP4 [checkpoints](https://huggingface.co/nvidia/DeepSeek-R1-FP4).

We explored different workloads including 1k-ISL 1k-OSL, 4k-ISL 1k-OSL, and 8k-ISL 1k-OSL. To quickly collect these data points and ensure that generation nodes are saturated, we used the `TLLM_BENCHMARK_REQ_QUEUES_SIZE` environment variable when benchmarking so that the workload can quickly reach a balanced point. The numbers are measured on commit `0cf2f6f154b4a5765d89945b20aa3449b2be7933` with a translation-task dataset, and generated by post-processing the per-iteration log.

<div align="center">
<figure>
  <img src="../media/tech_blog8_perf-1k-1k-dep.png" width="800">
</figure>
</div>
<p align="center"><sub><em>Figure 5: DeepSeek R1 throughput on ISL/OSL 1k/1k.</em></sub></p>

<div align="center">
<figure>
  <img src="../media/tech_blog8_perf-4k-1k-dep.png" width="800">
</figure>
</div>
<p align="center"><sub><em>Figure 6: DeepSeek R1 throughput on ISL/OSL 4k/1k.</em></sub></p>

<div align="center">
<figure>
  <img src="../media/tech_blog8_perf-8k-1k-dep.png" width="800">
</figure>
</div>
<p align="center"><sub><em>Figure 7: DeepSeek R1 throughput on ISL/OSL 8k/1k.</em></sub></p>

When enabling MTP, there is an extra performance boost compared to the baseline. We conducted end-to-end experiments against EP4 and EP8 as baselines, seeing up to 6.17x per-GPU output throughput improvement. The numbers are measured with `trtllm-serve` enabling multiple features like large EP, disaggregated serving, EPLB, and MTP, and using an OpenAI API client [tool](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) that sends requests to the server and collects performance metrics.

<div align="center">
<figure>
  <img src="../media/tech_blog8_perf-8k-1k-e2e-mtp.png" width="800">
</figure>
</div>
<p align="center"><sub><em>Figure 8: DeepSeek R1 throughput on ISL/OSL 8k/1k with MTP enabled.</em></sub></p>

To reproduce the numbers, refer to the [`examples/wide_ep/slurm_scripts`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/slurm_scripts) directory. The scripts there demonstrate how to launch TensorRT-LLM disaggregated serving with large-scale EP and other features enabled on a SLURM cluster.
## Future Work

### Further Performance Optimization

We are planning to implement more performance optimizations for the large EP implementation, including optimizing the `concat_qkv` operation for the context phase, quantizing `Wo_GEMM` to FP4, supporting low-precision `All2All` operations, and fusing some `All2All` kernels into one. We will also explore integrating more features such as PDL.

## Acknowledgements

This work represents an outstanding example of collaborative engineering excellence within the TensorRT-LLM team. The successful implementation and optimization of large-scale Expert Parallelism required coordinated efforts across multiple domains, from low-level CUDA kernel optimizations to high-level system architecture design. The dedication and technical expertise demonstrated by our team members throughout this project has been truly remarkable.

Large-scale Expert Parallelism is an important workload for users' production scenarios, enabling efficient deployment of large MoE models. The performance improvements achieved through this work demonstrate the transformative potential of expert parallelism at scale, and this work opens new possibilities for deploying increasingly sophisticated AI models in production environments.

@ -0,0 +1,362 @@
|
||||
# Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM
|
||||
|
||||
In the guide below, we will walk you through how to launch your own
|
||||
high-performance TensorRT-LLM server for **gpt-oss-120b** for inference.
|
||||
This guide covers both low-latency and max-throughput cases.
|
||||
|
||||
The typical use case for **low-latency**, is when we try to maximize the number of tokens per second per user with a limited concurrency (4, 8 or 16 users).
|
||||
|
||||
For **maximum throughput**, the goal is to maximize the amount of tokens produced per GPU per second. The former is an indication of how fast a system can produce tokens, the latter measures how many tokens a "chip" can generate per unit of time.
|
||||
|
||||
|
||||
## Prerequisites

- 1x NVIDIA B200/GB200/H200 GPU (8x NVIDIA B200/H200 GPUs or 4x GB200 GPUs in a single node recommended for higher performance)
- CUDA Toolkit 12.8 or later
- Docker with NVIDIA Container Toolkit installed
- Fast SSD storage for model weights
- Access to the gpt-oss-120b model checkpoint

A guide for getting great performance on H100 is forthcoming; this guide focuses on the GPUs listed above.
## Launching the TensorRT-LLM Docker container

The container image that you will use will be pulled from NVIDIA's NGC. This container is multi-platform and will run on both x64 and arm64 architectures: `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev`

Run the following Docker command to start the TensorRT-LLM container in interactive mode:

```bash
docker run --rm --ipc=host -it \
  --ulimit stack=67108864 \
  --ulimit memlock=-1 \
  --gpus all \
  -p 8000:8000 \
  -e TRTLLM_ENABLE_PDL=1 \
  -e TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True \
  -v ~/.cache:/root/.cache:rw \
  nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev \
  /bin/bash
```
This command:

- Automatically removes the container when stopped (`--rm`)
- Allows the container to interact with the host's IPC resources and shared memory for optimal performance (`--ipc=host`)
- Runs the container in interactive mode (`-it`)
- Sets up shared memory and stack limits for optimal performance
- Maps port 8000 from the container to your host
- Enables PDL for low-latency performance optimization
- Disables parallel weight loading

Lastly, the command mounts your user `.cache` directory to save the downloaded model checkpoints, which are stored in `~/.cache/huggingface/hub/` by default. This prevents having to re-download the weights each time you rerun the container.
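
Before moving on, it is worth confirming that the GPUs are visible from inside the container. A quick sanity check (run inside the container shell started above) is:

```bash
# Verify that all GPUs are visible to the container runtime
nvidia-smi
```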

## Running the TensorRT-LLM Server

As pointed out in the introduction, this guide covers the low-latency and max-throughput cases. Each requires a different configuration and command to run. We will first cover the low-latency use case, followed by the max-throughput use case.

### Low-latency Use-Case

#### Creating the Extra Options Configuration

To run a server for low-latency workloads, create a YAML configuration file, `low_latency.yaml`, as follows:

```bash
cat <<EOF > low_latency.yaml
enable_attention_dp: false
enable_mixed_sampler: true
cuda_graph_config:
  max_batch_size: 8
  enable_padding: true
moe_config:
  backend: TRTLLM
EOF
```

> Note: If you are using NVIDIA H200 GPUs, it is highly recommended to set `moe_config.backend` to TRITON to use the OpenAI Triton MoE kernel. See the section [(H200 Only) Using OpenAI Triton Kernels for MoE](#h200-only-using-openai-triton-kernels-for-moe) for more details.
#### Launching TensorRT-LLM Serve

To launch the TensorRT-LLM server with the **low latency** config, run one of the following commands. Commands for different GPU configurations are provided (1x GPU, 8x GPU, 4x GPU):

<details open> <summary>1x B200/GB200/H200</summary>

```bash
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve openai/gpt-oss-120b \
  --host 0.0.0.0 \
  --port 8000 \
  --backend pytorch \
  --tp_size 1 \
  --ep_size 1 \
  --trust_remote_code \
  --extra_llm_api_options low_latency.yaml \
  --kv_cache_free_gpu_memory_fraction 0.75
```
</details>

<details> <summary>8x B200/H200</summary>

```bash
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve openai/gpt-oss-120b \
  --host 0.0.0.0 \
  --port 8000 \
  --backend pytorch \
  --tp_size 8 \
  --ep_size 8 \
  --trust_remote_code \
  --extra_llm_api_options low_latency.yaml \
  --kv_cache_free_gpu_memory_fraction 0.75
```
</details>

<details> <summary>4x GB200/B200/H200</summary>

```bash
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve openai/gpt-oss-120b \
  --host 0.0.0.0 \
  --port 8000 \
  --backend pytorch \
  --tp_size 4 \
  --ep_size 4 \
  --trust_remote_code \
  --extra_llm_api_options low_latency.yaml \
  --kv_cache_free_gpu_memory_fraction 0.75
```
</details>
### Max-Throughput Use-Case

#### Creating the Extra Options Configuration

To run a server for max-throughput workloads, create a YAML configuration file, `max_throughput.yaml`, as follows:

```bash
cat <<EOF > max_throughput.yaml
enable_attention_dp: true
cuda_graph_config:
  max_batch_size: 640
  enable_padding: true
stream_interval: 10
moe_config:
  backend: CUTLASS
EOF
```

> Note: If you are using NVIDIA H200 GPUs, it is highly recommended to set `moe_config.backend` to TRITON to use the OpenAI Triton MoE kernel. See the section [(H200 Only) Using OpenAI Triton Kernels for MoE](#h200-only-using-openai-triton-kernels-for-moe) for more details.
#### Launching TensorRT-LLM Serve

To launch the TensorRT-LLM server with the **max throughput** config, run one of the following commands. Commands for different GPU configurations are provided (1x GPU, 8x GPU, 4x GPU):

<details open> <summary>1x B200/GB200/H200</summary>

```bash
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve openai/gpt-oss-120b \
  --host 0.0.0.0 \
  --port 8000 \
  --backend pytorch \
  --tp_size 1 \
  --ep_size 1 \
  --max_batch_size 640 \
  --trust_remote_code \
  --extra_llm_api_options max_throughput.yaml \
  --kv_cache_free_gpu_memory_fraction 0.9
```
</details>

<details> <summary>8x B200/H200</summary>

```bash
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve openai/gpt-oss-120b \
  --host 0.0.0.0 \
  --port 8000 \
  --backend pytorch \
  --tp_size 8 \
  --ep_size 8 \
  --max_batch_size 640 \
  --trust_remote_code \
  --extra_llm_api_options max_throughput.yaml \
  --kv_cache_free_gpu_memory_fraction 0.9
```
</details>

<details> <summary>4x GB200/B200/H200</summary>

```bash
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve openai/gpt-oss-120b \
  --host 0.0.0.0 \
  --port 8000 \
  --backend pytorch \
  --tp_size 4 \
  --ep_size 4 \
  --max_batch_size 640 \
  --trust_remote_code \
  --extra_llm_api_options max_throughput.yaml \
  --kv_cache_free_gpu_memory_fraction 0.9
```
</details>
This command:

- Maps port 8000 from the container to your host
- Uses the PyTorch backend and specifies the tensor and expert parallel sizes
- References the low-latency or max-throughput configuration file for extra options
- Configures memory settings for optimal performance
- Enables all GPUs with attention data parallelism for the max-throughput scenario

The initialization may take several minutes as the model is loaded and optimized.
## (H200 Only) Using OpenAI Triton Kernels for MoE

OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels on Hopper-based GPUs such as NVIDIA's H200 for best performance. The NGC TensorRT-LLM container image mentioned above already includes the required kernels, so you do not need to build or install them. It is highly recommended to enable them with the steps below:

### Selecting Triton as the MoE backend

To use the Triton MoE backend with **trtllm-serve** (or other similar commands), add this snippet to the YAML file passed via `--extra_llm_api_options`:

```yaml
moe_config:
  backend: TRITON
```

Alternatively, the TRITON backend can be enabled by passing the CLI flag to the `trtllm-serve` command at runtime:

```bash
--moe_backend TRITON
```
## Test the Server with a Sample Request

You can query the health/readiness of the server using:

```bash
curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
```

When `Status: 200` is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
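
If you are scripting the launch, a simple polling loop (a sketch; adjust the sleep interval and add a timeout as needed) can wait for readiness before sending traffic:

```bash
# Poll the health endpoint until the server reports ready (HTTP 200)
until [ "$(curl -s -o /dev/null -w '%{http_code}' http://localhost:8000/health)" = "200" ]; do
  echo "Waiting for server..."
  sleep 10
done
echo "Server is ready."
```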

Once the server is running, you can test it with a simple curl request:

```bash
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "openai/gpt-oss-120b",
  "messages": [
    {
      "role": "user",
      "content": "What is NVIDIAs advantage for inference?"
    }
  ],
  "max_tokens": 1024,
  "top_p": 0.9
}' -w "\n"
```
<details><summary><b>Show Example Output</b></summary>

```bash
{
  "id": "chatcmpl-c440e2a3e7e14cd699295afc3739bf42",
  "object": "chat.completion",
  "created": 1754358426,
  "model": "openai/gpt-oss-120b",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "<|channel|>analysis<|message|>The user asks: \"What is NVIDIA's advantage for inference?\" The user wants:
\"What is NVIDIA's advantage for inference?\" Likely they want a detailed answer about why NVIDIA has advantages for
inference tasks (e.g., GPU acceleration, TensorRT, software stack, Tensor Cores, hardware, performance, low latency,
etc.). The user wants a short answer? Not specified. The user wants a straightforward answer. Probably a brief
explanation: hardware features like Tensor cores, optimized libraries (TensorRT), ecosystem, software stack,
integration with frameworks, cuDNN, inference GPU architecture, high throughput, low latency, FP16, INT8, etc.\n\nWe
need to produce a response: Provide NVIDIA's advantage for inference. Provide specifics: hardware (Tensor cores, RT
cores, Ampere architecture), software (TensorRT, CUDA, cuDNN, Triton Inference Server), developer ecosystem, network
effects, etc.\n\nAlso mention inference performance vs competition, DPUs, etc.\n\nWe'll answer concisely but thorough.
\n\nWe also must not mention we are ChatGPT or mention policy. Provide final answer.\n\nLet's produce a clear,
well-structured answer.\n\n<|end|><|start|>assistant<|channel|>final<|message|>**NVIDIA’s edge in AI inference comes
from a tightly‑integrated stack of hardware, software, and ecosystem tools that together deliver the
highest‑throughput, lowest‑latency execution of deep‑learning models.**\n\n| **Layer** | **Key Advantages** | **Why
It Matters for Inference** |\n|-----------|-------------------|---------------------------------|\n| **GPU
Architecture** | • **Tensor Cores** (FP16/FP8/INT8) – dedicated matrix‑multiply units that give 2–8× higher ops/W
than classic CUDA cores. <br>• **CUDA SMs with mixed‑precision support** – can run FP16/INT8/INT4 with minimal
accuracy loss. <br>• **NVLink / NVSwitch** – high‑bandwidth, low‑latency inter‑GPU communication for multi‑GPU
serving. | Maximizes raw arithmetic density, letting a single GPU handle many concurrent requests. |\n|
**Inference‑focused Silicon** | • **A100 / H100 “Tensor‑Core” GPUs** – FP8 support (H100) and sparsity acceleration
(up to 2× throughput). <br>• **T4/Ti, L40, L4, Jetson edge modules** – power‑optimized variants for data‑center,
edge, and robotics. | Provides the right performance‑per‑watt for cloud, on‑prem, or edge deployments. |\n|
**Software Stack** | • **TensorRT** – a compiler & runtime that fuses layers, applies precision calibration, and
auto‑tunes kernels for each GPU. <br>• **CUDA, cuDNN, cuBLAS** – low‑level libraries tuned for every generation of
GPU. <br>• **Triton Inference Server** – model‑agnostic serving, model‑versioning, batching, GPU sharing, and
scaling across CPUs/GPUs. | Turns raw GPU power into production‑ready, low‑latency services with minimal engineering
effort. |\n| **Model Optimizations** | • **Quantization (INT8/FP8) & Structured Sparsity** – supported natively by
TensorRT and the hardware. <br>• **Automatic Mixed‑Precision (AMP)** – retains accuracy while cutting compute. |
Reduces memory bandwidth and compute cost while keeping accuracy within acceptable bounds. |\n| **Ecosystem &
Compatibility** | • **Broad framework support** – TensorFlow, PyTorch, ONNX, JAX, etc., all compile to TensorRT.
<br>• **NVIDIA NGC** – pre‑optimized model zoo, containers, and reference pipelines. <br>• **MLOps tools** –
NVIDIA Merlin, Clara, Metropolis, etc., for recommendation, medical, vision pipelines. | Engineers can
plug‑and‑play, accelerate, and ship models faster. |\n| **Scalability & Deployment Flexibility** | • **DGX Cloud,
EGX, Jetson, and Orin** – end‑to‑end solutions from cloud to edge. <br>• **Multi‑Instance GPU (MIG)** – partition
a single A100 into up to 7 isolated inference instances. <br>• **NVIDIA AI Enterprise** – managed software suite
for on‑prem data‑centers. | Allows the same code to run on a laptop, an edge device, or a massive data‑center
cluster. |\n| **Performance Benchmarks** | • **Industry‑leading latency/throughput** on MLPerf Inference (FP8,
INT8). <br>• **Sparsity‑aware kernels** give >2× speedup on H100 with < 0.1 % accuracy loss. | Demonstrates
real‑world advantage in the most respected benchmark suite. |\n|",
        "reasoning_content": null,
        "tool_calls": []
      },
      "logprobs": null,
      "finish_reason": "length",
      "stop_reason": null,
      "disaggregated_params": null
    }
  ],
  "usage": {
    "prompt_tokens": 17,
    "total_tokens": 1041,
    "completion_tokens": 1024
  },
  "prompt_token_ids": null
}
```
</details>

The server exposes a standard OpenAI-compatible API endpoint that accepts JSON requests. You can adjust parameters like `max_tokens`, `temperature`, and others according to your needs.
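
Because the endpoint follows the OpenAI API, any OpenAI-compatible client can talk to it. A minimal sketch using the `openai` Python package (an assumption; install it with `pip install openai`, and note the API key is a placeholder since the local server does not check it):

```python
from openai import OpenAI

# Point the client at the local trtllm-serve endpoint
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

response = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[{"role": "user", "content": "What is NVIDIAs advantage for inference?"}],
    max_tokens=256,
    top_p=0.9,
)
print(response.choices[0].message.content)
```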

## Troubleshooting Tips

- If you encounter CUDA out-of-memory errors, try reducing `max_batch_size`, `max_seq_len`, or `--kv_cache_free_gpu_memory_fraction`
- Ensure your model checkpoints are compatible with the expected format
- For performance issues, check GPU utilization with `nvidia-smi` while the server is running
- If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed
- For connection issues, make sure port 8000 is not being used by another application
## Performance Tuning

The configuration provided is optimized for 8x B200 GPUs, but you can adjust several parameters for your specific workload:

- `max_batch_size`: Controls how many requests can be batched together
- `max_draft_len`: The number of tokens Eagle can speculate ahead
- `kv_cache_free_gpu_memory_fraction`: Controls memory allocation for the KV cache

An illustrative combination of these knobs is sketched below.
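
For example, a throughput-leaning variant of the extra-options file might look like the following sketch (illustrative values only; `max_draft_len` applies only when a speculative-decoding configuration such as Eagle is enabled):

```yaml
# Hypothetical tuning sketch: trade per-user latency for batch throughput
cuda_graph_config:
  max_batch_size: 512   # pair with --max_batch_size 512 on the trtllm-serve command line
  enable_padding: true
stream_interval: 10
```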

latest/_sources/commands/trtllm-serve/index.rst.txt
@ -0,0 +1,9 @@
trtllm-serve
=======================

.. toctree::
   :maxdepth: 1

   trtllm-serve
   run-benchmark-with-trtllm-serve
@ -0,0 +1,222 @@

# Run benchmarking with `trtllm-serve`

TensorRT-LLM provides an OpenAI-compatible API via the `trtllm-serve` command.
A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference).

This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B:
* Methodology Introduction
* Launch the OpenAI-Compatible Server with the NGC container
* Run the performance benchmark
* Using `extra_llm_api_options`
## Methodology Introduction

The overall performance benchmarking involves:
1. Launch the OpenAI-compatible service with `trtllm-serve`
2. Run the benchmark with [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py)
## Launch the NGC container

TensorRT-LLM distributes pre-built containers on the [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags).

You can launch the container using the following command:

```bash
docker run --rm --ipc host -p 8000:8000 --gpus all -it nvcr.io/nvidia/tensorrt-llm/release
```
## Start the trtllm-serve service

> [!WARNING]
> The commands and configurations presented in this document are for illustrative purposes only.
> They serve as examples and may not deliver the optimal performance for your specific use case.
> Users are encouraged to tune the parameters based on their hardware and workload.

For benchmarking purposes, first create a bash script with the following content and name it `start.sh`:

```bash
#! /bin/bash
model_path=/path/to/llama3.1_70B
extra_llm_api_file=/tmp/extra-llm-api-config.yml

cat << EOF > ${extra_llm_api_file}
enable_attention_dp: false
print_iter_log: true
cuda_graph_config:
  enable_padding: true
  max_batch_size: 1024
kv_cache_config:
  dtype: fp8
EOF

trtllm-serve ${model_path} \
    --max_batch_size 1024 \
    --max_num_tokens 2048 \
    --max_seq_len 1024 \
    --kv_cache_free_gpu_memory_fraction 0.9 \
    --tp_size 1 \
    --ep_size 1 \
    --trust_remote_code \
    --extra_llm_api_options ${extra_llm_api_file}
```
> [!NOTE]
> `trtllm-llmapi-launch` is a script that launches the LLM-API code on
> Slurm-like systems and can support multi-node and multi-GPU setups,
> e.g., `trtllm-llmapi-launch trtllm-serve ...`

Run the `start.sh` script in the **background** with the following command:

```bash
bash -x start.sh &
```

Once the service is up, it will generate output logs like the following:

```bash
INFO: Started server process [80833]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://localhost:8000 (Press CTRL+C to quit)
```
## Run the benchmark

Similar to starting trtllm-serve, create a script to execute the benchmark with the following content and name it `bench.sh`:

```bash
concurrency_list="1 2 4 8 16 32 64 128 256"
multi_round=5
isl=1024
osl=1024
result_dir=/tmp/llama3.1_output
model_path=/path/to/llama3.1_70B

for concurrency in ${concurrency_list}; do
    num_prompts=$((concurrency * multi_round))
    python -m tensorrt_llm.serve.scripts.benchmark_serving \
        --model ${model_path} \
        --backend openai \
        --dataset-name "random" \
        --random-input-len ${isl} \
        --random-output-len ${osl} \
        --random-prefix-len 0 \
        --num-prompts ${num_prompts} \
        --max-concurrency ${concurrency} \
        --ignore-eos \
        --save-result \
        --result-dir "${result_dir}" \
        --result-filename "concurrency_${concurrency}.json" \
        --percentile-metrics "ttft,tpot,itl,e2el"
done
```
Then we can run the benchmark using the command below.

```bash
bash -x bench.sh &> output_bench.log
```
Below is some example TensorRT-LLM serving benchmark output. Your actual results may vary.

```
============ Serving Benchmark Result ============
Successful requests:              1
Benchmark duration (s):           1.64
Total input tokens:               1024
Total generated tokens:           1024
Request throughput (req/s):       0.61
Output token throughput (tok/s):  622.56
Total Token throughput (tok/s):   1245.12
User throughput (tok/s):          623.08
Mean Request AR:                  0.9980
Median Request AR:                0.9980
---------------Time to First Token----------------
Mean TTFT (ms):                   12.83
Median TTFT (ms):                 12.83
P99 TTFT (ms):                    12.83
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                   1.59
Median TPOT (ms):                 1.59
P99 TPOT (ms):                    1.59
---------------Inter-token Latency----------------
Mean ITL (ms):                    1.59
Median ITL (ms):                  1.59
P99 ITL (ms):                     1.77
----------------End-to-end Latency----------------
Mean E2EL (ms):                   1643.44
Median E2EL (ms):                 1643.44
P99 E2EL (ms):                    1643.44
==================================================
```
### Key Metrics

* Median Time to First Token (TTFT)
  * The typical time elapsed from when a request is sent until the first output token is generated.
* Median Time Per Output Token (TPOT)
  * The typical time required to generate each token *after* the first one.
* Median Inter-Token Latency (ITL)
  * The typical time delay between the completion of one token and the completion of the next.
* Median End-to-End Latency (E2EL)
  * The typical total time from when a request is submitted until the final token of the response is received.
* Total Token Throughput
  * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
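
These metrics are related: for a response of n output tokens, end-to-end latency decomposes approximately as TTFT plus per-token generation time. A small sanity check (a sketch using the sample numbers above):

```python
# E2EL ~= TTFT + TPOT * (output_tokens - 1), using the sample run above
ttft_ms = 12.83
tpot_ms = 1.59
output_tokens = 1024

e2el_estimate_ms = ttft_ms + tpot_ms * (output_tokens - 1)
print(f"estimated E2EL: {e2el_estimate_ms:.2f} ms")  # ~1639 ms vs. 1643.44 ms measured
```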

## About `extra_llm_api_options`

trtllm-serve provides the `extra_llm_api_options` knob to **overwrite** parameters specified by trtllm-serve.
Generally, we create a YAML file that contains various performance switches, e.g.:

```yaml
cuda_graph_config:
  enable_padding: true
print_iter_log: true
kv_cache_config:
  dtype: fp8
enable_attention_dp: true
```
The following is a list of common performance switches.

#### `kv_cache_config`

**Description**: A section for configuring the Key-Value (KV) cache.

**Options**:

* `dtype`: Sets the data type for the KV cache.
  * **Default**: `auto` (uses the data type specified in the model checkpoint).

#### `cuda_graph_config`

**Description**: A section for configuring CUDA graphs to optimize performance.

**Options**:

* `enable_padding`: If true, input batches are padded to the nearest `cuda_graph_batch_size`. This can significantly improve performance.
  * **Default**: `false`
* `max_batch_size`: Sets the maximum batch size for which a CUDA graph will be created.
  * **Default**: `0`
  * **Recommendation**: Set this to the same value as the `--max_batch_size` command-line option.
* `batch_sizes`: A specific list of batch sizes to create CUDA graphs for.
  * **Default**: `None`

#### `moe_config`

**Description**: Configuration for Mixture-of-Experts (MoE) models.

**Options**:

* `backend`: The backend to use for MoE operations.
  * **Default**: `CUTLASS`

#### `attention_backend`

**Description**: The backend to use for attention calculations.

**Default**: `TRTLLM`
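
Putting several of these switches together, an illustrative (not prescriptive) extra-options file could look like:

```yaml
# Illustrative combination of the switches described above
kv_cache_config:
  dtype: fp8
cuda_graph_config:
  enable_padding: true
  max_batch_size: 1024   # match --max_batch_size on the command line
moe_config:
  backend: CUTLASS
attention_backend: TRTLLM
```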

See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options that can be used in `extra_llm_api_options`.
@ -175,26 +175,6 @@ TRT-LLM multimodal supports the following modalities and data types (depending o
    ]}
Benchmark
---------

You can use any benchmark client compatible with the OpenAI API to test the serving performance of ``trtllm-serve``; we recommend ``genai-perf``, and here is a benchmarking recipe.

First, install ``genai-perf`` with ``pip``:

.. code-block:: bash

   pip install genai-perf

Then, :ref:`start a server<Starting a Server>` with ``trtllm-serve`` and ``TinyLlama-1.1B-Chat-v1.0``.

Finally, test performance with the following command:

.. literalinclude:: ../../../examples/serve/genai_perf_client.sh
   :language: bash
   :linenos:

Refer to the `README <https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/README.md>`_ of ``genai-perf`` for more guidance.
Multi-node Serving with Slurm
-----------------------------
@ -278,3 +258,6 @@ Syntax
.. click:: tensorrt_llm.commands.serve:main
   :prog: trtllm-serve
   :nested: full

Besides the above examples, ``trtllm-serve`` is also used as an entrypoint for performance benchmarking.
Please refer to `Performance Benchmarking with trtllm-serve <https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/commands/trtllm-serve/trtllm-serve-bench.md>`_ for more details.
@ -5,6 +5,6 @@ Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/
Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/serve/openai_completion_client_json_schema.py.

.. literalinclude:: ../../../examples/serve/openai_completion_client_json_schema.py
   :lines: 2-42
   :lines: 2-52
   :language: python
   :linenos:
@ -75,11 +75,11 @@ Welcome to TensorRT-LLM's Documentation!
.. toctree::
   :maxdepth: 2
   :caption: Command-Line Reference
   :hidden:
   :name: Command-Line Reference

   commands/trtllm-bench
   commands/trtllm-build
   commands/trtllm-serve
   commands/trtllm-serve/index


.. toctree::
@ -9,14 +9,17 @@
Before the pre-built Python wheel can be installed via `pip`, a few
prerequisites must be put into place:

Install the CUDA Toolkit following the [CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and
make sure the `CUDA_HOME` environment variable is properly set.

```bash
# Optional step: Only required for Blackwell and Grace Hopper
# Optional step: Only required for NVIDIA Blackwell GPUs and SBSA platform
pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

sudo apt-get -y install libopenmpi-dev
```

The PyTorch CUDA 12.8 package is required for supporting NVIDIA Blackwell and Grace Hopper GPUs. On prior GPUs, this extra installation is not required.
The PyTorch CUDA 12.8 package is required for supporting NVIDIA Blackwell GPUs and the SBSA platform. On prior GPUs or the Linux x86_64 platform, this extra installation is not required.

```{tip}
Instead of manually installing the prerequisites as described
@ -55,16 +58,3 @@ There are some known limitations when you pip install pre-built TensorRT-LLM whe
when OMPI was not configured --with-slurm and we weren't able
to discover a SLURM installation in the usual places.
```
2. CUDA Toolkit

`pip install tensorrt-llm` won't install the CUDA Toolkit on your system, and the CUDA Toolkit is not required if you just want to deploy a TensorRT-LLM engine.
TensorRT-LLM uses [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/) to quantize a model, and ModelOpt requires the CUDA Toolkit to JIT-compile certain kernels (not included in PyTorch) in order to perform quantization effectively.
Please install the CUDA Toolkit when you see the following message when running ModelOpt quantization.

```
/usr/local/lib/python3.10/dist-packages/modelopt/torch/utils/cpp_extension.py:65:
UserWarning: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
Unable to load extension modelopt_cuda_ext and falling back to CPU version.
```
The installation of the CUDA Toolkit is covered in the [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/).
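
A quick way to confirm the toolkit is visible before running quantization (a simple sanity check, assuming a bash shell and a default install location):

```bash
# Verify that CUDA_HOME points at a CUDA installation and nvcc is usable
echo "CUDA_HOME=${CUDA_HOME:-<not set>}"
"${CUDA_HOME:-/usr/local/cuda}/bin/nvcc" --version
```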

@ -272,7 +272,7 @@ API Reference
   :special-members: __init__
   :member-order: groupwise
   :inherited-members:
   :exclude-members: model_dump_json,model_rebuild,parse_raw,model_config,model_parametrized_name,model_extra,model_post_init,schema,dict,model_copy,validate,model_dump,json,model_construct,model_computed_fields,model_validate_strings,model_validate,parse_obj,update_forward_refs,model_json_schema,model_validate_json,model_fields_set,construct,parse_file,from_orm,schema_json,copy,model_fields
   :exclude-members: model_construct,model_post_init,model_extra,schema,construct,parse_raw,model_json_schema,model_computed_fields,update_forward_refs,schema_json,dict,parse_obj,parse_file,from_orm,model_fields,model_rebuild,model_validate_json,model_dump,model_fields_set,model_copy,model_parametrized_name,model_dump_json,model_validate_strings,copy,model_config,model_validate,validate,json

.. autoclass:: tensorrt_llm.llmapi.TrtLlmArgs
   :members:
@ -281,7 +281,7 @@ API Reference
   :special-members: __init__
   :member-order: groupwise
   :inherited-members:
   :exclude-members: model_dump_json,model_rebuild,parse_raw,model_config,model_parametrized_name,model_extra,model_post_init,schema,dict,model_copy,validate,model_dump,json,model_construct,model_computed_fields,model_validate_strings,model_validate,parse_obj,update_forward_refs,model_json_schema,model_validate_json,model_fields_set,construct,parse_file,from_orm,schema_json,copy,model_fields
   :exclude-members: model_construct,model_post_init,model_extra,schema,construct,parse_raw,model_json_schema,model_computed_fields,update_forward_refs,schema_json,dict,parse_obj,parse_file,from_orm,model_fields,model_rebuild,model_validate_json,model_dump,model_fields_set,model_copy,model_parametrized_name,model_dump_json,model_validate_strings,copy,model_config,model_validate,validate,json

.. autoclass:: tensorrt_llm.llmapi.AutoDecodingConfig
   :members:
@ -291,3 +291,11 @@ API Reference
   :member-order: groupwise
   :inherited-members:

.. autoclass:: tensorrt_llm.llmapi.AttentionDpConfig
   :members:
   :undoc-members:
   :show-inheritance:
   :special-members: __init__
   :member-order: groupwise
   :inherited-members:
@ -52,7 +52,7 @@ Append “python-gil” to the Nsys “-t” option.
2. Set the environment variable `TLLM_TORCH_PROFILE_TRACE=<path>`, and the results will be saved to `<path>`.

### Visualize the PyTorch profiler results
Use [chrome://tracing/](chrome://tracing/) to inspect the saved profile.
Use <chrome://tracing/> to inspect the saved profile.


## Examples
@ -88,4 +88,4 @@ TLLM_PROFILE_START_STOP=100-150 nsys profile \

The Nsight Systems reports will be saved to `trace.nsys-rep`. Use the NVIDIA Nsight Systems application to open it.

The PyTorch profiler results will be saved to `trace.json`. Use [chrome://tracing/](chrome://tracing/) to inspect the saved profile.
The PyTorch profiler results will be saved to `trace.json`. Use <chrome://tracing/> to inspect the saved profile.
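
As a usage sketch (the path is a placeholder; the variable applies to whichever TensorRT-LLM workload you launch next in the same shell):

```bash
# Save a PyTorch profiler trace for later inspection in chrome://tracing
export TLLM_TORCH_PROFILE_TRACE=/tmp/trace.json
```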
@ -12,6 +12,8 @@ Tuning batch sizes, parallelism configurations, and other options may lead to im

For DeepSeek R1 performance, please check out our [performance guide](../blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md).

For more information on benchmarking with `trtllm-bench`, see this NVIDIA [blog post](https://developer.nvidia.com/blog/llm-inference-benchmarking-performance-tuning-with-tensorrt-llm/).

## Throughput Measurements

The table below shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages),
@ -21,50 +23,64 @@ The performance numbers below were collected using the steps described in this d

Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).

### FP4 Models:
```
### Hardware
The following GPU variants were used for testing:
- H100 SXM 80GB (DGX H100)
- H200 SXM 141GB (DGX H200)
- GH200 96GB HBM3 (480GB LPDDR5X)
- B200 180GB (DGX B200)
- GB200 192GB (GB200 NVL72)

Other hardware variants may have different TDP, memory bandwidth, core count, or other features leading to performance differences on these workloads.

### FP4 Models

```text
nvidia/Llama-3.3-70B-Instruct-FP4
nvidia/Llama-3.1-405B-Instruct-FP4
```
#### Llama 3.3 70B FP4

| | GPU | B200 | | | |
|:---|:---|:---|:---|:---|:---|
| | TP Size | 1 | 2 | 4 | 8 |
| ISL, OSL | | | | | |
| | | | | | |
| 128, 128 | | 10,994.48 | 17,542.11 | 24,667.31 | 27,272.27 |
| 128, 2048 | | 9,580.46 | 15,432.35 | 23,568.12 | 31,174.31 |
| 128, 4096 | | 6,418.39 | 9,841.53 | 17,808.76 | 25,229.25 |
| 500, 2000 | | 7,343.32 | 11,850.57 | 20,709.67 | 28,038.78 |
| 1000, 1000 | | 6,752.53 | 10,815.88 | 16,413.04 | 20,060.66 |
| 1000, 2000 | | 6,670.07 | 9,830.73 | 15,597.49 | 20,672.37 |
| 1024, 2048 | | 6,636.75 | 9,807.13 | 15,519.23 | 20,617.28 |
| 2048, 128 | | 1,342.17 | 1,989.41 | 3,033.14 | 4,035.64 |
| 5000, 500 | | 1,429.67 | 2,419.67 | 3,686.84 | 5,182.96 |
| 20000, 2000 | | 629.77 | 1,177.01 | 2,120.66 | 3,429.03 |

| | GPU: | B200 | GB200 |
|:---|:---|:---|:---|
| | TP Size | 1 | 1 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 10,613.84 | 11,100.97 |
| 128, 2048 | | 9,445.51 | 10,276.05 |
| 128, 4096 | | 6,276.85 | 7,351.12 |
| 500, 2000 | | 6,983.27 | 8,194.30 |
| 1000, 1000 | | 6,434.29 | 7,401.80 |
| 1000, 2000 | | 6,725.03 | 6,478.72 |
| 1024, 2048 | | 6,546.61 | 7,922.88 |
| 2048, 128 | | 1,330.35 | 1,418.47 |
| 2048, 2048 | | 4,528.48 | 5,326.77 |
| 5000, 500 | | 1,427.44 | 1,502.44 |
| 20000, 2000 | | 636.36 | 732.43 |
#### Llama 3.1 405B FP4

| | GPU | B200 | |
|:---|:---|:---|:---|
| | TP Size | 4 | 8 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 6,163.81 | 9,002.90 |
| 128, 2048 | | 7,081.21 | 10,288.28 |
| 128, 4096 | | 6,028.37 | 8,713.77 |
| 500, 2000 | | 5,858.75 | 9,125.86 |
| 1000, 1000 | | 4,848.00 | 7,582.97 |
| 1000, 2000 | | 5,375.25 | 7,626.28 |
| 1024, 2048 | | 5,345.70 | 7,464.03 |
| 2048, 128 | | 693.55 | 1,086.56 |
| 5000, 500 | | 947.49 | 1,532.45 |
| 20000, 2000 | | 641.11 | 1,097.84 |

| | GPU: | B200 | GB200 |
|:---|:---|:---|:---|
| | TP Size | 4 | 4 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 6,218.89 | 6,598.97 |
| 128, 2048 | | 7,178.10 | 7,497.40 |
| 128, 4096 | | 5,890.89 | 5,898.19 |
| 500, 2000 | | 5,844.37 | 6,198.33 |
| 1000, 1000 | | 4,958.53 | 5,243.35 |
| 1000, 2000 | | 4,874.16 | 4,905.51 |
| 1024, 2048 | | 4,833.19 | 4,686.38 |
| 2048, 128 | | 737.95 | 761.58 |
| 2048, 2048 | | 4,024.02 | 4,326.56 |
| 5000, 500 | | 1,032.40 | 1,078.87 |
| 20000, 2000 | | 667.39 | 649.95 |
### FP8 Models:
```
### FP8 Models

```text
nvidia/Llama-3.1-8B-Instruct-FP8
nvidia/Llama-3.3-70B-Instruct-FP8
nvidia/Llama-3.1-405B-Instruct-FP8
@ -73,61 +89,65 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
#### Llama 3.1 8B FP8

| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:---|:---|:---|:---|
| | TP Size | 1 | 1 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 27,970.14 | 27,688.36 |
| 128, 2048 | | 23,326.38 | 21,841.15 |
| 128, 4096 | | 17,508.51 | 13,730.89 |
| 500, 2000 | | 21,390.41 | 17,833.34 |
| 1000, 1000 | | 17,366.89 | 15,270.62 |
| 1000, 2000 | | 16,831.31 | 13,798.08 |
| 1024, 2048 | | 16,737.03 | 13,385.50 |
| 2048, 128 | | 3,488.03 | 3,414.67 |
| 5000, 500 | | 3,813.69 | 3,394.54 |
| 20000, 2000 | | 1,696.66 | 1,345.42 |

| | GPU: | GH200 | H100 | H200 |
|:---|:---|:---|:---|:---|
| | TP Size | 1 | 1 | 1 |
| ISL, OSL | | | | |
| | | | | |
| 128, 128 | | 27,304.25 | 26,401.48 | 27,027.80 |
| 128, 2048 | | 24,045.60 | 21,413.21 | 23,102.25 |
| 128, 4096 | | 15,409.85 | 13,541.54 | 17,396.83 |
| 500, 2000 | | 20,123.88 | 17,571.01 | 19,759.16 |
| 1000, 1000 | | 16,352.99 | 14,991.62 | 17,162.49 |
| 1000, 2000 | | 15,705.82 | 13,505.23 | 16,227.11 |
| 1024, 2048 | | 16,102.52 | 13,165.91 | 16,057.66 |
| 2048, 128 | | 3,573.85 | 3,275.55 | 3,390.69 |
| 2048, 2048 | | 10,767.05 | 9,462.43 | 11,822.14 |
| 5000, 500 | | 3,584.74 | 3,276.47 | 3,758.08 |
| 20000, 2000 | | 1,393.31 | 1,340.69 | 1,705.68 |
#### Llama 3.3 70B FP8

| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | |
|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|
| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
| ISL, OSL | | | | | | | | | |
| | | | | | | | | | |
| 128, 128 | | 3,605.47 | 6,427.69 | 10,407.42 | 15,434.37 | 3,128.33 | 6,216.91 | | |
| 128, 2048 | | 4,315.80 | 8,464.03 | 13,508.59 | 20,759.72 | 756.42 | 5,782.57 | 11,464.94 | 17,424.32 |
| 128, 4096 | | 2,701.17 | 5,573.55 | 11,458.56 | 16,668.75 | | 3,868.37 | 8,206.39 | 12,624.61 |
| 500, 2000 | | 3,478.76 | 6,740.06 | 12,200.18 | | | 4,684.06 | 9,903.53 | 14,553.93 |
| 1000, 1000 | | 2,744.32 | 5,119.72 | 8,685.44 | 12,744.51 | 742.14 | 4,247.19 | 7,435.65 | 11,018.81 |
| 1000, 2000 | | 2,896.44 | 5,847.26 | 9,031.21 | 13,141.17 | 533.74 | 3,866.53 | 7,611.12 | 11,139.22 |
| 1024, 2048 | | 2,874.18 | 5,568.61 | 8,946.71 | 13,082.62 | 530.16 | 3,796.68 | 7,575.24 | 11,004.31 |
| 2048, 128 | | 435.90 | 772.67 | 1,264.76 | | | 736.89 | 1,213.33 | 1,839.22 |
| 2048, 2048 | | | | | 10,412.85 | | | | |
| 5000, 500 | | 545.96 | 997.15 | 1,698.22 | 2,655.28 | 204.94 | 862.91 | 1,552.68 | 2,369.84 |
| 20000, 2000 | | 276.66 | 620.33 | 1,161.29 | 1,985.85 | | 416.13 | 903.66 | 1,554.10 |

| | GPU: | H100 | H200 |
|:---|:---|:---|:---|
| | TP Size | 2 | 2 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 6,092.28 | 6,327.98 |
| 128, 2048 | | 5,892.94 | 7,467.36 |
| 128, 4096 | | 3,828.46 | 5,526.42 |
| 500, 2000 | | 4,654.74 | 6,639.15 |
| 1000, 1000 | | 4,181.06 | 4,773.33 |
| 1000, 2000 | | 3,708.93 | 5,790.36 |
| 1024, 2048 | | 3,785.04 | 5,480.44 |
| 2048, 128 | | 723.40 | 747.55 |
| 2048, 2048 | | 2,785.53 | 3,775.80 |
| 5000, 500 | | 865.55 | 978.28 |
| 20000, 2000 | | 411.85 | 609.42 |
#### Llama 3.1 405B FP8

| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:---|:---|:---|:---|
| | TP Size | 8 | 8 |
| ISL, OSL | | | |
| | | | |
| 128, 2048 | | 5,567.87 | |
| 128, 4096 | | 5,136.85 | |
| 500, 2000 | | 4,787.61 | 3,673.91 |
| 1000, 1000 | | 3,286.30 | 3,012.22 |
| 1000, 2000 | | 3,636.76 | 3,262.20 |
| 1024, 2048 | | 3,618.66 | 3,109.70 |
| 2048, 128 | | 443.10 | 449.02 |
| 5000, 500 | | 645.46 | |
| 20000, 2000 | | | 372.12 |

| | GPU: | H100 | H200 |
|:---|:---|:---|:---|
| | TP Size | 8 | 8 |
| Runtime Input/Output Lengths | | | |
| | | | |
| 128, 128 | | | 3,705.18 |
| 128, 2048 | | 4,517.39 | 4,715.13 |
| 128, 4096 | | 2,910.31 | 4,475.91 |
| 500, 2000 | | 3,664.62 | 4,804.10 |
| 1000, 1000 | | 2,955.50 | 3,208.25 |
| 1000, 2000 | | 2,884.69 | 3,630.29 |
| 1024, 2048 | | 3,237.41 | 3,609.50 |
| 2048, 128 | | 433.47 | 441.35 |
| 2048, 2048 | | 2,216.55 | 2,840.86 |
| 5000, 500 | | 579.05 | 645.26 |
| 20000, 2000 | | 363.27 | 509.87 |
#### Llama 4 Maverick FP8

| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
Note: Performance for Llama 4 on sequence lengths less than 8,192 tokens is affected by an issue introduced in v0.21. To reproduce the Llama 4 performance noted here, please use v0.20.

| | GPU | H200 | H100 |
|:---|:---|:---|:---|
| | TP Size | 8 | 8 |
| ISL, OSL | | | |
@ -140,7 +160,6 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
| 2048, 128 | | 4,364.06 | 3,832.38 |
| 2048, 2048 | | 12,800.89 | |
| 5000, 500 | | 5,128.60 | |
| 20000, 2000 | | 1,764.27 | 1,400.79 |
## Reproducing Benchmarked Results

@ -216,7 +235,7 @@ a model name (HuggingFace reference or path to a local model), a [generated data
trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
```

The data collected for the v0.20 benchmarks was run with the following file:
The data collected for the v0.21 benchmarks was run with the following file:

`llm_options.yml`
```yaml
@ -240,7 +259,7 @@ cuda_graph_config:
    - 8192
```

In a majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue.
In many cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` or lower if out-of-memory errors are encountered.

The results will be printed to the terminal upon benchmark completion. For example,
@ -19,8 +19,12 @@ Note: **This project will download and install additional third-party open sourc
The following examples can most easily be executed using the prebuilt [Docker release container available on NGC](https://registry.ngc.nvidia.com/orgs/nvstaging/teams/tensorrt-llm/containers/release) (see also [release.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/release.md) on GitHub). Ensure to run these commands as a user with appropriate permissions, preferably `root`, to streamline the setup process.

## LLM API
## Launch Docker on a node with NVIDIA GPUs deployed

```bash
docker run --ipc host --gpus all -it nvcr.io/nvidia/tensorrt-llm/release
```
## Run Offline Inference with the LLM API
The LLM API is a Python API designed to facilitate setup and inference with TensorRT-LLM directly within Python. It enables model optimization by simply specifying a HuggingFace repository name or a model checkpoint. The LLM API streamlines the process by managing checkpoint conversion, engine building, engine loading, and model inference, all through a single Python object.

Here is a simple example to show how to use the LLM API with TinyLlama.
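
A minimal sketch of this pattern (the prompt and sampling values are illustrative):

```python
from tensorrt_llm import LLM, SamplingParams

# Build and optimize the model directly from a HuggingFace repo name
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

prompts = ["Hello, my name is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)
```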

@ -34,17 +38,29 @@ You can also directly load TensorRT Model Optimizer's [quantized checkpoints on
To learn more about the LLM API, check out the [](llm-api/index) and [](examples/llm_api_examples).

(deploy-with-trtllm-serve)=
## Deploy with trtllm-serve
## Deploy online serving with trtllm-serve

You can use the `trtllm-serve` command to start an OpenAI compatible server to interact with a model.
To start the server, you can run a command like the following example:
To start the server, you can run a command like the following example inside a Docker container:

```bash
trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
```

After the server starts, you can access familiar OpenAI endpoints such as `v1/chat/completions`.
You can run inference such as the following example from another terminal:

> [!NOTE]
> If you are running `trtllm-serve` inside a Docker container, you have two options for sending API requests:
>
> 1. Expose port `8000` to access the server from outside the container.
>
> 2. Open a new terminal and use the following command to directly attach to the running container:
>
> ```bash
> docker exec -it <container_id> bash
> ```

After the server has started, you can access well-known OpenAI endpoints such as `v1/chat/completions`.
Inference can then be performed using examples similar to the one provided below, from a separate terminal.

```bash
curl -X POST http://localhost:8000/v1/chat/completions \
@ -88,7 +104,7 @@ _Example Output_
}
```

For detailed examples and command syntax, refer to the [trtllm-serve](commands/trtllm-serve.rst) section. If you are running `trtllm-serve` inside a Docker container, you have two options for sending API requests:
For detailed examples and command syntax, refer to the [trtllm-serve](commands/trtllm-serve.rst) section.

1. Expose port `8000` to access the server from outside the container.

@ -98,81 +114,12 @@ For detailed examples and command syntax, refer to the [trtllm-serve](commands/t
   docker exec -it <container_id> bash
   ```
## Model Definition API

### Prerequisites

- This quick start uses the Meta Llama 3.1 model. This model is subject to a particular [license](https://llama.meta.com/llama-downloads/). To download the model files, agree to the terms and [authenticate with Hugging Face](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct?clone=true).

- Complete the [installation](./installation/linux.md) steps.

- Pull the weights and tokenizer files for the chat-tuned variant of the Llama 3.1 8B model from the [Hugging Face Hub](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct).

  ```console
  git clone https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct
  ```

(quick-start-guide-compile)=
### Compile the Model into a TensorRT Engine

Use the [Llama model definition](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/llama) from the `examples/models/core/llama` directory of the GitHub repository.
The model definition is a minimal example that shows some of the optimizations available in TensorRT-LLM.

```console
# From the root of the cloned repository, start the TensorRT-LLM container
make -C docker ngc-release_run LOCAL_USER=1 IMAGE_TAG=x.y.z

# Log in to huggingface-cli
# You can get your token from huggingface.co/settings/token
huggingface-cli login --token *****

# Convert the model into TensorRT-LLM checkpoint format
cd examples/models/core/llama
pip install -r requirements.txt
pip install --upgrade transformers # Llama 3.1 requires transformers 4.43.0+.
python3 convert_checkpoint.py --model_dir Meta-Llama-3.1-8B-Instruct --output_dir llama-3.1-8b-ckpt

# Compile the model
trtllm-build --checkpoint_dir llama-3.1-8b-ckpt \
    --gemm_plugin float16 \
    --output_dir ./llama-3.1-8b-engine
```

{{container_tag_admonition}}

When you create a model definition with the TensorRT-LLM API, you build a graph of operations from [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) primitives that form the layers of your neural network. These operations map to specific kernels: prewritten programs for the GPU.

In this example, we included the `gpt_attention` plugin, which implements a FlashAttention-like fused attention kernel, and the `gemm` plugin, which performs matrix multiplication with FP32 accumulation. We also called out the desired precision for the full model as FP16, matching the default precision of the weights that you downloaded from Hugging Face. For more information about plugins and quantizations, refer to the [Llama example](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/llama) and the {ref}`precision` section.

### Run the Model

Now that you have the model engine, run the engine and perform inference.

```console
python3 ../run.py --engine_dir ./llama-3.1-8b-engine --max_output_len 100 --tokenizer_dir Meta-Llama-3.1-8B-Instruct --input_text "How do I count to nine in French?"
```

### Deploy with Triton Inference Server

To create a production-ready deployment of your LLM, use the [Triton Inference Server backend for TensorRT-LLM](https://github.com/triton-inference-server/tensorrtllm_backend) to leverage the TensorRT-LLM C++ runtime for rapid inference execution and include optimizations like in-flight batching and paged KV caching. Triton Inference Server with the TensorRT-LLM backend is available as a [pre-built container through NVIDIA NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags).

1. Clone the TensorRT-LLM backend repository:

   ```console
   cd ..
   git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
   cd tensorrtllm_backend
   ```

2. Refer to [End to end workflow to run llama 7b](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/llama.md) in the TensorRT-LLM backend repository to deploy the model with Triton Inference Server.

## Next Steps

In this Quick Start Guide, you:

- Saw an example of the LLM API
- Learned about deploying a model with `trtllm-serve`
- Learned about the Model Definition API

For more examples, refer to:
@ -153,7 +153,7 @@ The following table shows the supported software for TensorRT-LLM.
* -
  - Software Compatibility
* - Container
  - [25.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
  - [25.06](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
  - [10.11](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision

@ -73,6 +73,7 @@ All published functionality in the Release Notes has been fully tested and verif
### Known Issues
- accuracy/test_cli_flow::TestGpt2::test_beam_search_large is broken.
- Enabling disaggregated serving, MTP, and the overlap scheduler at the same time can lead to accuracy problems.
- In 0.21, full chunked attention support was added to make sure the LLaMA4 model can functionally run with > 8K sequence length, while there is a known performance regression (affecting only the LLaMA4 model) on Hopper due to this functional enhancement. The root cause of the regression has been identified and the fix will be part of a future release.

## TensorRT-LLM Release 0.20.0
@ -8,11 +8,11 @@
| Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | |
| Chunked Prefill | Yes | Yes | Yes | Untested | --- | | | | | | | | | |
| MTP | Yes | Yes | Yes | Yes | Untested | --- | | | | | | | | |
| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Untested | No | --- | | | | | | | |
| EAGLE-3(Two Model Engine) | NO | Yes | Yes | No | Untested | No | No | --- | | | | | | |
| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Yes | No | --- | | | | | | | |
| EAGLE-3(Two Model Engine) | NO | Yes | Yes | No | Yes | No | No | --- | | | | | | |
| Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | |
| TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
| KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
| Slide Window Attention | Yes | Yes | Yes | Untested | No | Untested | Untested | Untested | Yes | Yes | WIP | --- | | |
| Logits Post Processor | No | Yes | Yes | No | Untested | No | No | No | Yes | Yes | Yes | Yes | --- | |
| Logits Post Processor | No | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | |
| Guided Decoding | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | Yes | --- |
@ -59,7 +59,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -71,7 +71,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="1.0.0rc5" />
|
||||
<meta name="docsearch:version" content="1.0.0rc6" />
|
||||
|
||||
|
||||
</head>
|
||||
@@ -391,7 +391,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -720,9 +724,9 @@ This feature is currently experimental, and the related API is subjected to chan
   <div class="footer-item">
     <div class="extra_footer">

-      <p>Last updated on August 01, 2025.</p>
+      <p>Last updated on August 06, 2025.</p>

-      <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+      <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

   </div></div>
@@ -59,7 +59,7 @@
     <script>
     DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
     DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-    DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+    DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
     DOCUMENTATION_OPTIONS.show_version_warning_banner =
     false;
     </script>
@@ -71,7 +71,7 @@

     <meta name="viewport" content="width=device-width, initial-scale=1"/>
     <meta name="docsearch:language" content="en"/>
-    <meta name="docsearch:version" content="1.0.0rc5" />
+    <meta name="docsearch:version" content="1.0.0rc6" />


   </head>
@@ -391,7 +391,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -804,9 +808,9 @@ the TensorRT-LLM C++ Executor API.</p>
   <div class="footer-item">
     <div class="extra_footer">

-      <p>Last updated on August 01, 2025.</p>
+      <p>Last updated on August 06, 2025.</p>

-      <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+      <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

   </div></div>
@@ -59,7 +59,7 @@
     <script>
     DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
     DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-    DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc5';
+    DOCUMENTATION_OPTIONS.theme_switcher_version_match = '1.0.0rc6';
     DOCUMENTATION_OPTIONS.show_version_warning_banner =
     false;
     </script>
@@ -71,7 +71,7 @@

     <meta name="viewport" content="width=device-width, initial-scale=1"/>
     <meta name="docsearch:language" content="en"/>
-    <meta name="docsearch:version" content="1.0.0rc5" />
+    <meta name="docsearch:version" content="1.0.0rc6" />


   </head>
@@ -391,7 +391,11 @@
 <li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-bench.html">trtllm-bench</a></li>

 <li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../commands/trtllm-serve/index.html">trtllm-serve</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../commands/trtllm-serve/trtllm-serve.html">trtllm-serve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../commands/trtllm-serve/run-benchmark-with-trtllm-serve.html">Run benchmarking with <code class="docutils literal notranslate"><span class="pre">trtllm-serve</span></code></a></li>
+</ul>
+</details></li>
 </ul>
 <p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
 <ul class="nav bd-sidenav">
@@ -671,9 +675,9 @@
   <div class="footer-item">
     <div class="extra_footer">

-      <p>Last updated on August 01, 2025.</p>
+      <p>Last updated on August 06, 2025.</p>

-      <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/fbee279">fbee279</a>.</p>
+      <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/a16ba64">a16ba64</a>.</p>

   </div></div>
Some files were not shown because too many files have changed in this diff.