From dfa11d810e72adac2d21073ae7644c007ce673cd Mon Sep 17 00:00:00 2001 From: Venky <23023424+venkywonka@users.noreply.github.com> Date: Sat, 20 Dec 2025 00:18:43 +0530 Subject: [PATCH] [TRTC-102][docs] `--extra_llm_api_options`->`--config` in docs/examples/tests (#10005) --- .gitignore | 2 +- .../note_sections.rst | 13 +- ...practice_on_DeepSeek-R1_in_TensorRT-LLM.md | 24 +- .../blogs/tech_blog/blog11_GPT_OSS_Eagle3.md | 2 +- ..._R1_MTP_Implementation_and_Optimization.md | 8 +- ...ling_Expert_Parallelism_in_TensorRT-LLM.md | 8 +- .../blog6_Llama4_maverick_eagle_guide.md | 2 +- .../blog9_Deploying_GPT_OSS_on_TRTLLM.md | 14 +- docs/source/commands/trtllm-bench.rst | 20 +- docs/source/commands/trtllm-eval.rst | 4 + .../run-benchmark-with-trtllm-serve.md | 66 ++-- .../commands/trtllm-serve/trtllm-serve.rst | 20 +- docs/source/deployment-guide/config_table.rst | 338 +++++++++--------- ...loyment-guide-for-deepseek-r1-on-trtllm.md | 8 +- .../deployment-guide-for-gpt-oss-on-trtllm.md | 8 +- ...nt-guide-for-kimi-k2-thinking-on-trtllm.md | 2 +- ...oyment-guide-for-llama3.3-70b-on-trtllm.md | 6 +- ...oyment-guide-for-llama4-scout-on-trtllm.md | 6 +- ...ployment-guide-for-qwen3-next-on-trtllm.md | 8 +- .../deployment-guide-for-qwen3-on-trtllm.md | 10 +- docs/source/deployment-guide/index.rst | 22 +- .../developer-guide/perf-benchmarking.md | 13 +- docs/source/developer-guide/perf-overview.md | 6 +- .../benchmarking_with_trtllm_bench.md | 10 +- docs/source/features/disagg-serving.md | 22 +- docs/source/features/guided-decoding.md | 12 +- docs/source/features/lora.md | 20 +- docs/source/features/parallel-strategy.md | 2 +- docs/source/features/speculative-decoding.md | 8 +- .../torch_compile_and_piecewise_cuda_graph.md | 78 ++-- docs/source/helper.py | 11 +- .../legacy/performance/perf-benchmarking.md | 12 +- .../benchmarking_with_trtllm_bench.md | 4 +- .../advanced/serving_with_trtllm_serve.md | 4 +- docs/source/torch/features/lora.md | 8 +- examples/__init__.py | 14 + examples/configs/README.md | 2 +- examples/configs/__init__.py | 14 + examples/configs/database/__init__.py | 14 + examples/disaggregated/README.md | 36 +- .../slurm/benchmark/start_worker.sh | 2 +- .../service_discovery_example/launch.slurm | 8 +- .../slurm/simple_example/launch.slurm | 4 +- examples/llm-api/llm_mgmn_trtllm_bench.sh | 2 +- examples/models/core/deepseek_v3/README.md | 52 +-- examples/models/core/gemma/README.md | 8 +- examples/models/core/gpt_oss/README.md | 2 +- examples/models/core/kimi_k2/README.md | 4 +- examples/models/core/llama/README.md | 4 +- examples/models/core/llama4/README.md | 12 +- .../models/core/mistral_large_3/README.md | 2 +- examples/models/core/multimodal/README.md | 2 +- .../models/core/nemotron/README_nano-v2-vl.md | 6 +- examples/models/core/phi/phi4-mm.md | 4 +- examples/models/core/qwen/README.md | 12 +- .../disaggregated/disagg_serving_local.sh | 4 +- .../serve/deepseek_r1_reasoning_parser.sh | 4 +- .../openai_completion_client_json_schema.py | 2 +- examples/sparse_attention/RocketKV.md | 6 +- examples/wide_ep/ep_load_balancer/README.md | 12 +- scripts/generate_config_table.py | 17 +- .../accuracy/test_disaggregated_serving.py | 4 +- .../defs/disaggregated/test_auto_scaling.py | 2 +- .../defs/disaggregated/test_disaggregated.py | 4 +- .../disaggregated/test_disaggregated_etcd.py | 4 +- .../defs/perf/README_release_test.md | 4 +- tests/integration/defs/perf/test_perf.py | 16 +- .../defs/stress_test/stress_test.py | 2 +- tests/integration/defs/test_e2e.py | 8 +- 
.../tools/test_config_database_sync.py | 29 +- 70 files changed, 625 insertions(+), 498 deletions(-) rename docs/source/{deployment-guide => _includes}/note_sections.rst (75%) create mode 100644 examples/__init__.py create mode 100644 examples/configs/__init__.py create mode 100644 examples/configs/database/__init__.py diff --git a/.gitignore b/.gitignore index 130ea9837b..7f7ffd18c6 100644 --- a/.gitignore +++ b/.gitignore @@ -56,7 +56,7 @@ tensorrt_llm/scripts docs/source/**/*.rst !docs/source/examples/index.rst !docs/source/deployment-guide/config_table.rst -!docs/source/deployment-guide/note_sections.rst +!docs/source/_includes/note_sections.rst *.swp # Testing diff --git a/docs/source/deployment-guide/note_sections.rst b/docs/source/_includes/note_sections.rst similarity index 75% rename from docs/source/deployment-guide/note_sections.rst rename to docs/source/_includes/note_sections.rst index 4cd0d1c41d..d0b1657638 100644 --- a/docs/source/deployment-guide/note_sections.rst +++ b/docs/source/_includes/note_sections.rst @@ -1,11 +1,20 @@ .. - Reusable note sections for deployment guides. + Reusable note sections for docs. Include specific notes using: - .. include:: note_sections.rst + .. include:: /note_sections.rst :start-after: .. start-note- :end-before: .. end-note- +.. start-note-config-flag-alias + +.. note:: + + **Non-breaking**: ``--config `` is the preferred flag for passing a :ref:`YAML configuration file `. + Existing workflows using ``--extra_llm_api_options `` continue to work; it is an equivalent alias. + +.. end-note-config-flag-alias + .. start-note-traffic-patterns .. note:: diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index ad0e9975a1..7072f770bf 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -139,7 +139,7 @@ To do the benchmark, run the following command: ```bash YOUR_DATA_PATH= -cat >./extra-llm-api-config.yml<./config.yml<./extra-llm-api-config.yml <./config.yml <./extra-llm-api-config.yml <./config.yml < -cat >./extra-llm-api-config.yml<./config.yml<./extra-llm-api-config.yml<./config.yml< -cat >./extra-llm-api-config.yml<./config.yml< -cat >./extra-llm-api-config.yml<./config.yml< ./extra_llm_api_options.yaml < ./config.yaml < ./extra_llm_api_options_eplb.yaml < ./config_eplb.yaml < @@ -201,7 +201,7 @@ trtllm-serve \ --ep_size 4 \ --max_batch_size 640 \ --trust_remote_code \ - --extra_llm_api_options max_throughput.yaml \ + --config max_throughput.yaml \ --kv_cache_free_gpu_memory_fraction 0.9 ``` @@ -223,7 +223,7 @@ OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT LLM ### Selecting Triton as the MoE backend -To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--extra_llm_api_options`: +To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--config`: ```yaml moe_config: @@ -347,7 +347,7 @@ OpenAI ships a set of Triton kernels optimized for its MoE models. 
TensorRT-LLM ### Selecting Triton as the MoE backend -To use the Triton MoE backend with **trtllm-serve** (or other commands), add this snippet to the YAML file passed via `--extra_llm_api_options`: +To use the Triton MoE backend with **trtllm-serve** (or other commands), add this snippet to the YAML file passed via `--config`: ```yaml moe_config: diff --git a/docs/source/commands/trtllm-bench.rst b/docs/source/commands/trtllm-bench.rst index cd69874e0c..fee60a9ab7 100644 --- a/docs/source/commands/trtllm-bench.rst +++ b/docs/source/commands/trtllm-bench.rst @@ -3,9 +3,12 @@ trtllm-bench trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It provides three main subcommands for different benchmarking scenarios: -**Common Options for All Commands:** +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias -**Usage:** +Syntax +------ .. click:: tensorrt_llm.commands.bench:main :prog: trtllm-bench @@ -14,8 +17,11 @@ trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It p +Dataset preparation +------------------ + prepare_dataset.py -=========================== +^^^^^^^^^^^^^^^^^^ trtllm-bench is designed to work with the `prepare_dataset.py `_ script, which generates benchmark datasets in the required format. The prepare_dataset script supports: @@ -38,7 +44,7 @@ trtllm-bench is designed to work with the `prepare_dataset.py --help``. +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias + Syntax diff --git a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md index 34a509f5a4..089426d9b7 100644 --- a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md +++ b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md @@ -3,30 +3,11 @@ TensorRT LLM provides the OpenAI-compatible API via `trtllm-serve` command. A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference). 
-This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B and Qwen2.5-VL-7B for multimodal models: - * Methodology Introduction - * Launch the OpenAI-Compatible Server with NGC container - * Run the performance benchmark - * Using `extra_llm_api_options` - * Multimodal Serving and Benchmarking - -## Table of Contents -- [Run benchmarking with `trtllm-serve`](#run-benchmarking-with-trtllm-serve) - - [Table of Contents](#table-of-contents) - - [Methodology Introduction](#methodology-introduction) - - [Preparation](#preparation) - - [Launch the NGC container](#launch-the-ngc-container) - - [Start the trtllm-serve service](#start-the-trtllm-serve-service) - - [Benchmark using `tensorrt_llm.serve.scripts.benchmark_serving`](#benchmark-using-tensorrt_llmservescriptsbenchmark_serving) - - [Key Metrics](#key-metrics) - - [About `extra_llm_api_options`](#about-extra_llm_api_options) - - [`kv_cache_config`](#kv_cache_config) - - [`cuda_graph_config`](#cuda_graph_config) - - [`moe_config`](#moe_config) - - [`attention_backend`](#attention_backend) - - [Multimodal Serving and Benchmarking](#multimodal-serving-and-benchmarking) - - [Setting up Multimodal Serving](#setting-up-multimodal-serving) - - [Multimodal Benchmarking](#multimodal-benchmarking) +```{contents} +:Contents +:local: +:depth: 3 +``` ## Methodology Introduction @@ -57,9 +38,9 @@ For benchmarking purposes, first create a bash script using the following code a ```bash #! /bin/bash model_path=/path/to/llama3.1_70B -extra_llm_api_file=/tmp/extra-llm-api-config.yml +config_file=/tmp/config.yml -cat << EOF > ${extra_llm_api_file} +cat << EOF > ${config_file} enable_attention_dp: false print_iter_log: true cuda_graph_config: @@ -77,7 +58,7 @@ trtllm-serve ${model_path} \ --tp_size 1 \ --ep_size 1 \ --trust_remote_code \ - --extra_llm_api_options ${extra_llm_api_file} + --config ${config_file} ``` > [!NOTE] > The trtllm-llmapi-launch is a script that launches the LLM-API code on @@ -215,17 +196,24 @@ $$ To get more detailed metrics besides the key metrics above, there is an [experimental tool](https://github.com/NVIDIA/TensorRT-LLM/tree/main/tensorrt_llm/serve/scripts/time_breakdown) for request time breakdown. -## About `extra_llm_api_options` - trtllm-serve provides `extra_llm_api_options` knob to **overwrite** the parameters specified by trtllm-serve. - Generally, We create a YAML file that contains various performance switches. - e.g - ```yaml - cuda_graph_config: - padding_enabled: true - print_iter_log: true - kv_cache_dtype: fp8 - enable_attention_dp: true - ``` +## About `--config` + +```{eval-rst} +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + +`trtllm-serve` provides `--config` to **overwrite** the parameters specified by `trtllm-serve`. +Generally, we create a YAML file that contains various performance switches. For example: + +```yaml +cuda_graph_config: + padding_enabled: true +print_iter_log: true +kv_cache_dtype: fp8 +enable_attention_dp: true +``` The following is a list of common performance switches. #### `kv_cache_config` @@ -274,7 +262,7 @@ The following is a list of common performance switches.  
**Default**: TRTLLM -See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the extra\_llm\_api\_options`.` +See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `--config`. ## Multimodal Serving and Benchmarking diff --git a/docs/source/commands/trtllm-serve/trtllm-serve.rst b/docs/source/commands/trtllm-serve/trtllm-serve.rst index 33bad7f1e5..7e09872a9b 100644 --- a/docs/source/commands/trtllm-serve/trtllm-serve.rst +++ b/docs/source/commands/trtllm-serve/trtllm-serve.rst @@ -98,7 +98,7 @@ First, create a configuration file: .. code-block:: bash - cat >./extra-llm-api-config.yml<./config.yml<`_ m .. code-block:: bash - echo -e "enable_attention_dp: true\npytorch_backend_config:\n enable_overlap_scheduler: true" > extra-llm-api-config.yml + echo -e "enable_attention_dp: true\npytorch_backend_config:\n enable_overlap_scheduler: true" > config.yml srun -N 2 -w [NODES] \ --output=benchmark_2node.log \ @@ -210,7 +210,7 @@ You can deploy `DeepSeek-V3 `_ m --container-image= \ --container-mounts=/workspace:/workspace \ --container-workdir /workspace \ - bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml" + bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --config ./config.yml" See `the source code `_ of ``trtllm-llmapi-launch`` for more details. @@ -234,11 +234,11 @@ For the default PyTorch backend, iteration statistics logging is enabled by sett # extra_llm_config.yaml enable_iter_perf_stats: true -Start the server and specify the ``--extra_llm_api_options`` argument with the path to the YAML file: +Start the server and specify the ``--config`` argument with the path to the YAML file: .. code-block:: bash - trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --extra_llm_api_options extra_llm_config.yaml + trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --config config.yaml After sending at least one inference request to the server, you can fetch runtime iteration statistics by polling the ``/metrics`` endpoint. Since the statistics are stored in an internal queue and removed once retrieved, it's recommended to poll the endpoint shortly after each request and store the results if needed. @@ -272,10 +272,16 @@ Example output: } ] +.. _configuring-with-yaml-files: + Configuring with YAML Files ---------------------------- -You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--extra_llm_api_options`` option to the path of a YAML file, the arguments in the file will override the corresponding command line arguments. +You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--config`` option to the path of a YAML file. The arguments in the file override the corresponding command line arguments. + +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. 
end-note-config-flag-alias The yaml file is configuration of `tensorrt_llm.llmapi.LlmArgs `_, the class has multiple levels of hierarchy, to configure the top level arguments like ``max_batch_size``, the yaml file should be like: diff --git a/docs/source/deployment-guide/config_table.rst b/docs/source/deployment-guide/config_table.rst index c2e1e5b55d..bb59b7505f 100644 --- a/docs/source/deployment-guide/config_table.rst +++ b/docs/source/deployment-guide/config_table.rst @@ -1,4 +1,4 @@ -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns @@ -25,121 +25,121 @@ - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 
--extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve 
deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` .. end-deepseek-ai/DeepSeek-R1-0528 @@ -166,169 +166,169 @@ - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` * - 4xB200_NVL - Balanced - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` 
* - 4xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp4_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` * - 4xB200_NVL - Balanced - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp4_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` * - 4xB200_NVL - Max Throughput - 8192 / 1024 - 256 - `8k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + - ``trtllm-serve 
nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 256 - `8k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` .. 
end-nvidia/DeepSeek-R1-0528-FP4-v2 @@ -355,720 +355,720 @@ - 1024 / 1024 - 4 - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` * - B200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` * - B200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` * - 2xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` 
+ - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` * - 2xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` * - 2xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` * - 4xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` * - 4xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp4_conc64.yaml `_ - - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max 
Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` * - H200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` * - H200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` + - 
``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` * - H200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` * - 2xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` * - 2xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` * - 2xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp2_conc32.yaml `_ - - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` * - 4xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` * - 4xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` * - 4xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` * - 4xH200_SXM - High 
Throughput - 1024 / 8192 - 32 - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` * - 4xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` * - 4xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` .. end-openai/gpt-oss-120b diff --git a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md index e4165eac09..881f86eb12 100644 --- a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md @@ -115,7 +115,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve deepseek-ai/DeepSeek-R1-0528 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve deepseek-ai/DeepSeek-R1-0528 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -124,7 +124,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -200,7 +200,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Default**: `TRTLLM` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ### Wide Expert Parallelism @@ -435,7 +435,7 @@ $$ The following tables list recommended configurations from the comprehensive database for different performance profiles. ```{eval-rst} -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. 
end-note-traffic-patterns diff --git a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md index 5a9f9f4c72..d28f3fa9f3 100644 --- a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md @@ -113,7 +113,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -122,7 +122,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -178,7 +178,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * `backend`: The backend to use for MoE operations. **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -383,7 +383,7 @@ $$ The following table lists recommended configurations from the comprehensive database for different performance profiles. ```{eval-rst} -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns diff --git a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md index 391a72091d..8ae2dac147 100644 --- a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md @@ -60,7 +60,7 @@ With the `EXTRA_OPTIONS_YAML_FILE`, use the following example command to launch ```bash trtllm-serve nvidia/Kimi-K2-Thinking-NVFP4 \ --host 0.0.0.0 --port 8000 \ - --extra_llm_api_options ${EXTRA_OPTIONS_YAML_FILE} + --config ${EXTRA_OPTIONS_YAML_FILE} ``` TensorRT LLM will load weights and select the best kernels during startup. 
The server is successfully launched when the following log is shown: diff --git a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md index d3e328d810..f58405e8be 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md @@ -83,7 +83,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Llama-3.3-70B-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -92,7 +92,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -170,7 +170,7 @@ These options provide control over TensorRT LLM's behavior and are set within th  **Default**: TRTLLM -See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`. +See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint diff --git a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md index 7d69b7a8be..d279ab3716 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md @@ -82,7 +82,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Llama-4-Scout-17B-16E-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -91,7 +91,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. 
+These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -166,7 +166,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Default**: `TRTLLM` -See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`. +See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md index 46bf724b71..3ff4432d1b 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md @@ -61,7 +61,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Qwen3-Next model from within the container. ```shell -trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -70,7 +70,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -127,7 +127,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * `backend`: The backend to use for MoE operations. **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -220,7 +220,7 @@ If you want to save the results to a file add the following options. --result-filename "concurrency_${concurrency}.json" ``` -For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. 
diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md index 894c6a1e63..bda3e1a4c4 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md @@ -66,7 +66,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Qwen3 model from within the container. ```shell -trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -75,7 +75,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -127,10 +127,10 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Options**: * `backend`: The backend to use for MoE operations. - + **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -247,7 +247,7 @@ If you want to save the results to a file add the following options. --result-filename "concurrency_${concurrency}.json" ``` -For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. diff --git a/docs/source/deployment-guide/index.rst b/docs/source/deployment-guide/index.rst index 644a9d9ae9..1d2df5e5b6 100644 --- a/docs/source/deployment-guide/index.rst +++ b/docs/source/deployment-guide/index.rst @@ -17,7 +17,7 @@ The TensorRT LLM Docker container makes these config files available at ``/app/t export TRTLLM_DIR="/app/tensorrt_llm" # path to the TensorRT LLM repo in your local environment -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-quick-start-isl-osl :end-before: .. 
end-note-quick-start-isl-osl @@ -36,52 +36,52 @@ This table is designed to provide a straightforward starting point; for detailed - H100, H200 - Max Throughput - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 `_ - B200, GB200 - Max Throughput - `deepseek-r1-deepgemm.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Max Throughput - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Min Latency - `deepseek-r1-latency.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` * - `gpt-oss-120b `_ - Any - Max Throughput - `gpt-oss-120b-throughput.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` * - `gpt-oss-120b `_ - Any - Min Latency - `gpt-oss-120b-latency.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` * - `Qwen3-Next-80B-A3B-Thinking `_ - Any - Max Throughput - `qwen3-next.yaml `_ - - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` + - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --config ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` * - Qwen3 family (e.g. 
`Qwen3-30B-A3B `_) - Any - Max Throughput - `qwen3.yaml `_ - - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) + - ``trtllm-serve Qwen/Qwen3-30B-A3B --config ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) * - `Llama-3.3-70B (FP8) `_ - Any - Max Throughput - `llama-3.3-70b.yaml `_ - - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` + - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --config ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` * - `Llama 4 Scout (FP8) `_ - Any - Max Throughput - `llama-4-scout.yaml `_ - - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` + - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --config ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` Model-Specific Deployment Guides --------------------------------- diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index ab6feab7e3..e95e28c496 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md @@ -2,6 +2,13 @@ # TensorRT LLM Benchmarking + +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + TensorRT LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility that aims to make it easier for users to reproduce our officially published [performance overview](./perf-overview.md#throughput-measurements). `trtllm-bench` provides the follows: @@ -176,7 +183,7 @@ trtllm-bench --model meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synth To benchmark the PyTorch backend (`tensorrt_llm._torch`), use the following command with [dataset](#preparing-a-dataset) generated from previous steps. The `throughput` benchmark initializes the backend by tuning against the dataset provided via `--dataset` (or the other build mode settings described above). -Note that CUDA graph is enabled by default. You can add additional pytorch config with `--extra_llm_api_options` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. +Note that CUDA graph is enabled by default. You can add additional pytorch config with `--config` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. ```{tip} The command below specifies the `--model_path` option. The model path is optional and used only when you want to run a locally @@ -289,7 +296,7 @@ The generated dataset will include LoRA request metadata. 
Below is an example of **LoRA Configuration** -Create an `extra-llm-api-options.yaml` file with LoRA configuration: +Create a `config.yaml` file with LoRA configuration: ```yaml lora_config: @@ -314,7 +321,7 @@ trtllm-bench --model /path/to/base/model \ throughput \ --dataset synthetic_lora_data.json \ --backend pytorch \ - --extra_llm_api_options extra-llm-api-options.yaml + --config config.yaml ``` ```{note} diff --git a/docs/source/developer-guide/perf-overview.md b/docs/source/developer-guide/perf-overview.md index ae3a0072e9..8602ff1896 100644 --- a/docs/source/developer-guide/perf-overview.md +++ b/docs/source/developer-guide/perf-overview.md @@ -269,7 +269,7 @@ Testing was performed using the PyTorch backend - this workflow does not require | Stage | Description | Command | | :- | - | - | | [Dataset](#preparing-a-dataset) | Create a synthetic dataset | `python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file` | -| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options` | +| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options` | ### Variables @@ -323,7 +323,7 @@ a model name (HuggingFace reference or path to a local model), a [generated data For dense / non-MoE models: ```shell -trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options ``` Llama 3.3 @@ -337,7 +337,7 @@ cuda_graph_config: For MoE models: ```shell -trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options ``` GPT-OSS: diff --git a/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md index d5e0cde8f2..84f8015889 100644 --- a/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md +++ b/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md @@ -24,7 +24,13 @@ As in the PyTorch workflow, AutoDeploy does not require a separate `trtllm-bench ## Advanced Configuration -For more granular control over AutoDeploy's behavior during benchmarking, use the `--extra_llm_api_options` flag with a YAML configuration file: +For more granular control over AutoDeploy's behavior during benchmarking, use the `--config` flag with a YAML configuration file: + +```{eval-rst} +.. include:: ../../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. 
end-note-config-flag-alias +``` ```bash trtllm-bench \ @@ -32,7 +38,7 @@ trtllm-bench \ throughput \ --dataset /tmp/synthetic_128_128.txt \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` ### Configuration Examples diff --git a/docs/source/features/disagg-serving.md b/docs/source/features/disagg-serving.md index ce52b9a3d5..b6eb4b17f9 100644 --- a/docs/source/features/disagg-serving.md +++ b/docs/source/features/disagg-serving.md @@ -1,4 +1,4 @@ -# Disaggregated Serving +# Disaggregated Serving - [Motivation](#Motivation) - [KV Cache Exchange](#KV-Cache-Exchange) @@ -100,6 +100,12 @@ For more information on how to use Dynamo with TensorRT-LLM, please refer to [th The second approach to evaluate disaggregated LLM inference with TensorRT LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 6 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 6). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request. +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` +
@@ -126,19 +132,19 @@ For example, you could launch two context servers and one generation servers as ``` -# Generate context_extra-llm-api-config.yml +# Generate context_config.yml # Overlap scheduler for context servers are disabled because it's not supported for disaggregated context servers yet -echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_extra-llm-api-config.yml +echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_config.yml # Start Context servers -CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_0 & -CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_1 & +CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --config ./context_config.yml &> log_ctx_0 & +CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --config ./context_config.yml &> log_ctx_1 & -# Generate gen_extra-llm-api-config.yml -echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_extra-llm-api-config.yml +# Generate gen_config.yml +echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_config.yml # Start Generation servers -CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --extra_llm_api_options ./gen_extra-llm-api-config.yml &> log_gen_0 & +CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --config ./gen_config.yml &> log_gen_0 & ``` Once the context and generation servers are launched, you can launch the disaggregated server, which will accept requests from clients and do the orchestration between context diff --git a/docs/source/features/guided-decoding.md b/docs/source/features/guided-decoding.md index 110efc8e51..3591d1808f 100644 --- a/docs/source/features/guided-decoding.md +++ b/docs/source/features/guided-decoding.md @@ -9,14 +9,20 @@ TensorRT LLM supports two grammar backends: ## Online API: `trtllm-serve` -If you are using `trtllm-serve`, enable guided decoding by specifying `guided_decoding_backend` with `xgrammar` or `llguidance` in the YAML configuration file, and pass it to `--extra_llm_api_options`. For example, +If you are using `trtllm-serve`, enable guided decoding by specifying `guided_decoding_backend` with `xgrammar` or `llguidance` in the YAML configuration file, and pass it to `--config`. For example, + +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` ```bash -cat > extra_llm_api_options.yaml < config.yaml < @@ -158,21 +164,21 @@ For the op outside of attention and MLP, the developer should obey the torch.com
Figure 2. TensorRT LLM Custom torch.compile Backend Overview
-Above is the overview of the TensorRT LLM custom backend for `torch.compile`. +Above is the overview of the TensorRT LLM custom backend for `torch.compile`. #### Torch IR Optimization Torch IR is the Fx graph that is directly traced by Torch Dynamo. It has several important features for us to do some graph rewriting and get information: 1. Preserve the operations as is: We can easily find a specific operation and then transform it to arbitrary operations. No need to deal with `auto_functionalize`, etc. -2. Preserve original variable tensor name in the Fx graph: For Piecewise CUDA Graph, it needs to find the correct `SymInt` which represents the token number. Hence, we rely on the `input_ids`'s shape to make it find the `SymInt` correctly. +2. Preserve original variable tensor name in the Fx graph: For Piecewise CUDA Graph, it needs to find the correct `SymInt` which represents the token number. Hence, we rely on the `input_ids`'s shape to make it find the `SymInt` correctly. #### ATen IR Optimization We get ATen IR after explicitly calling `aot_module_simplified` on the Fx graph. ATen IR is 1. In SSA format (no input mutations) -2. Strict subset of aten op (<250): In Torch IR, Python native add op, `torch.Tensor().add()`, `torch.aten.add.Tensor` could be three different ops. After the transform, they will be the same op. +2. Strict subset of aten op (<250): In Torch IR, Python native add op, `torch.Tensor().add()`, `torch.aten.add.Tensor` could be three different ops. After the transform, they will be the same op. 3. Guaranteed metadata information, e.g., dtype and shape propagation On this IR level, TensorRT LLM will do the following optimization @@ -183,16 +189,16 @@ All fusions are located in `tensorrt_llm/_torch/compilation/patterns` and implem 1. Inadequate handling of scalars and lists: - Scalars get specialized into the traced pattern, forcing one pattern per value—impractical and non-general. - - Lists are flattened, turning elements into separate input arguments, making it impossible to match the original operation. + - Lists are flattened, turning elements into separate input arguments, making it impossible to match the original operation. 2. Trace-driven pitfalls: Because it’s trace-based, the generated source patterns may not meet our needs and can introduce additional issues as we expand pattern coverage. We mainly do the operation fusion for AllReduce & RMSNorm. 1. AllReduce related fusion: Fuse the following operations into one AllReduce op. + AllReduce + Residual + RMSNorm - + AllReduce + Residual + RMSNorm + FP8 Quantization + + AllReduce + Residual + RMSNorm + FP8 Quantization + AllReduce + Residual + RMSNorm + FP4 Quantization -2. AllReduce with User Buffer: Converts AllReduce operations to use userbuffers to avoid extra copy overhead. +2. AllReduce with User Buffer: Converts AllReduce operations to use userbuffers to avoid extra copy overhead. We enable these fusions in torch.compile because they’re difficult to express in eager mode. For the AllReduce + RMSNorm fusion, which is cross-module, implementing it in eager mode would require moving code between modules, leading to redundant, complex, and hard-to-maintain logic. @@ -204,7 +210,7 @@ Because ATen IR is SSA, in-place operations are rewritten as out-of-place via a ##### Auto Multi-stream -Currently torch.compile won't create a subgraph for user user-defined CUDA stream. Instead, it will convert it to `set_stream`. 
The set_stream op doesn't have any consumers, so it will be removed in the Torch IR to ATen IR transformation, thus losing all the multi-stream scheduling. +Currently torch.compile won't create a subgraph for user user-defined CUDA stream. Instead, it will convert it to `set_stream`. The set_stream op doesn't have any consumers, so it will be removed in the Torch IR to ATen IR transformation, thus losing all the multi-stream scheduling. To address this, we implemented an auto multi-stream scheduler: @@ -214,7 +220,7 @@ To address this, we implemented an auto multi-stream scheduler: 3. Schedules nodes onto up to `max_num_streams` specified by user config -4. Insert multi-stream related custom op: since the Fx graph executes operators in list order, so we insert streaming-control operators directly into the graph. Moreover, as these operators have no users, we cannot perform dead-code elimination after multi-stream scheduling. Below is an example of multi-stream, which `trtllm.dsv3_router_gemm_op.default` and `trtllm.silu_and_mul.default` + `trtllm.fp4_quantize.default` execute in parallel. +4. Insert multi-stream related custom op: since the Fx graph executes operators in list order, so we insert streaming-control operators directly into the graph. Moreover, as these operators have no users, we cannot perform dead-code elimination after multi-stream scheduling. Below is an example of multi-stream, which `trtllm.dsv3_router_gemm_op.default` and `trtllm.silu_and_mul.default` + `trtllm.fp4_quantize.default` execute in parallel. ``` call_function record_event trtllm.record_event (1,) {} @@ -238,7 +244,7 @@ To address this, we implemented an auto multi-stream scheduler: call_function record_stream_1 trtllm.record_stream (mm_1, 1) {} call_function record_event_4 trtllm.record_event (2,) {} call_function set_stream_1 trtllm.set_stream (0,) {} - call_function wait_event_2 trtllm.wait_event (2,) + call_function wait_event_2 trtllm.wait_event (2,) ``` #### Piecewise CUDA Graph @@ -254,14 +260,14 @@ In the current design, we assume the attention block is the only non-capturable Notes: -1. Attention **MUST NOT** have any output. The output tensor should be allocated by CUDA Graph. -2. Each sub-cudagraph **MUST** have at least one input tensor that contains the number of tokens in the shape. -3. Only allow dynamic shape for `num_of_tokens` dim. +1. Attention **MUST NOT** have any output. The output tensor should be allocated by CUDA Graph. +2. Each sub-cudagraph **MUST** have at least one input tensor that contains the number of tokens in the shape. +3. Only allow dynamic shape for `num_of_tokens` dim. ### Common Trace Failure 1. Custom op fake kernel: For every custom op, developers must implement a correct fake kernel. **Make sure to update the corresponding fake kernel when the custom op is changed** -2. Dynamic Iteration Number Loop: This is technically not a trace failure, but it will introduce long-time tracing that is generally not acceptable. When torch.compile tries to convert PyTorch modeling code to Fx graph, it will try to unroll the loop. For a loop that has a large and dynamic loop number with a large loop body, the tracing process will take a long time to do the unrolling. +2. Dynamic Iteration Number Loop: This is technically not a trace failure, but it will introduce long-time tracing that is generally not acceptable. When torch.compile tries to convert PyTorch modeling code to Fx graph, it will try to unroll the loop. 
For a loop that has a large and dynamic loop number with a large loop body, the tracing process will take a long time to do the unrolling. 1. If the IO of the loop can be easily written into a custom op format, try to replace it with a custom op 2. If the loop num is unchanged during the whole inference service lifetime, then it is ok to leave the loop as is. (e.g., Model decoder layer loop) @@ -276,30 +282,30 @@ Notes: + `torch.nonzeros()`: Produce data-dependent dynamic shape tensor + `torch.sym_min`: `SymInt` aware min + `torch.Tensor.tolist()`, `torch.Tensor.item()` - + **Solution:** Use them inside a custom op if these operators don't get involved in producing the custom op's output tensor. + + **Solution:** Use them inside a custom op if these operators don't get involved in producing the custom op's output tensor. -2. Use a custom object’s method: For a class like mapping config, we cannot directly use its method like has_pp() in the model forward. +2. Use a custom object’s method: For a class like mapping config, we cannot directly use its method like has_pp() in the model forward. - + **Solution**: We should convert it to a bool in the model init and use the bool. + + **Solution**: We should convert it to a bool in the model init and use the bool. ```python class Mapping(object): def __init__(self, ...): ... - + def has_pp(self): # Cannot use this method in torch.compile return self.pp_size > 1 ``` 3. Data Dependent Control(DDC) flow involved in code - + **Solution**: Try to avoid DDC in the code. Try to pre-compute the result outside of torch.compile's scope. For the following example, try to pre-compute the `torch.sum(data)` at the data preparation stage, and pass the result to the `forward`. + + **Solution**: Try to avoid DDC in the code. Try to pre-compute the result outside of torch.compile's scope. For the following example, try to pre-compute the `torch.sum(data)` at the data preparation stage, and pass the result to the `forward`. ```python class TestCase(torch.nn.Module): def __init__(self): super().__init__() - + def forward(self, x, data): y = x ** 2 if torch.sum(data) >= 4: # Data Dependent Control Here! @@ -308,7 +314,7 @@ Notes: t = y / 2 t = t + 10 return t - + test_case = TestCase() test_case = torch.compile(test_case, backend=Backend()) x = torch.randn(5).cuda() @@ -320,15 +326,15 @@ Notes: ### Recompilation -1. Try not to use data-dependent dynamic shapes in the model forward. (e.g., slice the tensor based on input value). This will introduce 0/1 specialization to the model and will possibly introduce recompile. +1. Try not to use data-dependent dynamic shapes in the model forward. (e.g., slice the tensor based on input value). This will introduce 0/1 specialization to the model and will possibly introduce recompile. 1. **0/1 specialization**: torch.compile will recompile the model if a dynamic tensor’s dim equals 0 or 1. In the worst case, it will recompile 3 times for 1 dimension: 0,1, >2 -2. For an int argument that would change during runtime, use `SymInt` rather than int in the C++ custom op definition. Otherwise, it will trigger a recompile when the value changes. +2. For an int argument that would change during runtime, use `SymInt` rather than int in the C++ custom op definition. Otherwise, it will trigger a recompile when the value changes. ```c++ TORCH_LIBRARY_FRAGMENT(trtllm, m) - { + { m.def("allgather(Tensor input, SymInt[]? sizes, int[] group) -> Tensor"); m.def("allgather_list(Tensor[] input_list, SymInt[]? 
sizes, int[] group) -> Tensor[]"); } @@ -340,13 +346,13 @@ Notes: 2. Control Flow based on dynamic shape - 3. Next power of two: Previously, we used `bit_length()` to implement the next power of 2 function. However, it will cause a recompile for every int value. Now rewrite the code to be torch.compile-friendly. + 3. Next power of two: Previously, we used `bit_length()` to implement the next power of 2 function. However, it will cause a recompile for every int value. Now rewrite the code to be torch.compile-friendly. ```python def next_positive_power_of_2(x: int) -> int: if x < 1: return 1 - + # Following code is equivalent to 1 << (x - 1).bit_length() # But this impl does not contain bit_length(), so it can be used by torch compile. # It can correctly handle 64-bit numbers, which should be enough for now. @@ -359,5 +365,3 @@ Notes: n |= n >> 32 return n + 1 ``` - - diff --git a/docs/source/helper.py b/docs/source/helper.py index 675bd697e9..9f6530e166 100644 --- a/docs/source/helper.py +++ b/docs/source/helper.py @@ -358,15 +358,20 @@ def update_version(): docs_source_dir = Path(__file__).parent.resolve() md_files = list(docs_source_dir.rglob("*.md")) + # Default is to replace `release:x.y.z` placeholders; set to 0 to disable. + if os.environ.get("TRTLLM_DOCS_REPLACE_CONTAINER_TAG", "1") != "1": + return + for file_path in md_files: with open(file_path, "r") as f: content = f.read() - content = content.replace( + updated = content.replace( "nvcr.io/nvidia/tensorrt-llm/release:x.y.z", f"nvcr.io/nvidia/tensorrt-llm/release:{version}", ) - with open(file_path, "w") as f: - f.write(content) + if updated != content: + with open(file_path, "w") as f: + f.write(updated) if __name__ == "__main__": diff --git a/docs/source/legacy/performance/perf-benchmarking.md b/docs/source/legacy/performance/perf-benchmarking.md index 9530b6da1b..caca11a7a4 100644 --- a/docs/source/legacy/performance/perf-benchmarking.md +++ b/docs/source/legacy/performance/perf-benchmarking.md @@ -415,11 +415,17 @@ Total Latency (ms): 13525.6862 ### Running with the PyTorch Workflow +```{eval-rst} +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + To benchmark the PyTorch backend (`tensorrt_llm._torch`), use the following command with [dataset](#preparing-a-dataset) generated from previous steps. With the PyTorch flow, you will not need to run `trtllm-bench build`; the `throughput` benchmark initializes the backend by tuning against the dataset provided via `--dataset` (or the other build mode settings described [above](#other-build-modes)). Note that CUDA graph is enabled by default. You can add additional pytorch config with -`--extra_llm_api_options` followed by the path to a YAML file. For more details, please refer to the +`--config` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. ```{tip} @@ -511,7 +517,7 @@ The generated dataset will include LoRA request metadata. 
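Before the LoRA-specific example below, here is a minimal sketch of the kind of YAML file that `--config` accepts in the PyTorch workflow described above. The keys and values are illustrative assumptions, not settings taken from this patch; check the LLM API reference for the options available in your build.

```yaml
# config.yaml -- illustrative sketch only; these keys/values are assumptions.
cuda_graph_config:
  enable_padding: true            # pad batches so more shapes reuse captured CUDA graphs
kv_cache_config:
  free_gpu_memory_fraction: 0.9   # portion of free GPU memory reserved for the KV cache
```

The file is then passed as `--config config.yaml` on the `trtllm-bench ... throughput` command line, in the same way as the LoRA example that follows.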
Below is an example of **LoRA Configuration** -Create an `extra-llm-api-options.yaml` file with LoRA configuration: +Create a `config.yaml` file with LoRA configuration: ```yaml lora_config: @@ -535,7 +541,7 @@ lora_config: trtllm-bench --model /path/to/base/model \ throughput \ --dataset synthetic_lora_data.json \ - --extra_llm_api_options extra-llm-api-options.yaml + --config config.yaml ``` ```{note} diff --git a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md index 43e2a1a46e..2f37c716cf 100644 --- a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md +++ b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md @@ -24,7 +24,7 @@ As in the PyTorch workflow, AutoDeploy does not require a separate `trtllm-bench ## Advanced Configuration -For more granular control over AutoDeploy's behavior during benchmarking, use the `--extra_llm_api_options` flag with a YAML configuration file: +For more granular control over AutoDeploy's behavior during benchmarking, use the `--config` flag with a YAML configuration file: ```bash trtllm-bench \ @@ -32,7 +32,7 @@ trtllm-bench \ throughput \ --dataset /tmp/synthetic_128_128.txt \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` ### Configuration Examples diff --git a/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md b/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md index 6e52fe4ea4..20693f6170 100644 --- a/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md +++ b/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md @@ -30,13 +30,13 @@ curl -s http://localhost:8000/v1/chat/completions \ ## Configuration via YAML -Use `--extra_llm_api_options` to supply a YAML file that augments or overrides server/runtime settings. +Use `--config` to supply a YAML file that augments or overrides server/runtime settings. ```bash trtllm-serve \ meta-llama/Llama-3.1-8B \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` Example `autodeploy_config.yaml`: diff --git a/docs/source/torch/features/lora.md b/docs/source/torch/features/lora.md index d00a27d49a..ccf7561efb 100644 --- a/docs/source/torch/features/lora.md +++ b/docs/source/torch/features/lora.md @@ -157,7 +157,7 @@ llm = LLM( ### YAML Configuration -Create an `extra_llm_api_options.yaml` file: +Create a `config.yaml` file: ```yaml lora_config: @@ -170,7 +170,7 @@ lora_config: ```bash python -m tensorrt_llm.commands.serve /path/to/model \ - --extra_llm_api_options extra_llm_api_options.yaml + --config config.yaml ``` ### Client Usage @@ -198,7 +198,7 @@ response = client.completions.create( ### YAML Configuration -Create an `extra_llm_api_options.yaml` file: +Create a `config.yaml` file: ```yaml lora_config: @@ -220,5 +220,5 @@ lora_config: ### Run trtllm-bench ```bash -trtllm-bench --model $model_path throughput --dataset $dataset_path --extra_llm_api_options extra-llm-api-options.yaml --num_requests 64 --concurrency 16 +trtllm-bench --model $model_path throughput --dataset $dataset_path --config config.yaml --num_requests 64 --concurrency 16 ``` diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/configs/README.md b/examples/configs/README.md index b9a47281d2..dc633c8b2c 100644 --- a/examples/configs/README.md +++ b/examples/configs/README.md @@ -1,5 +1,5 @@ # Recommended LLM API Configuration Settings -This directory contains recommended [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/) performance settings for popular models. They can be used out-of-the-box with `trtllm-serve` via the `--extra_llm_api_options` CLI flag, or you can adjust them to your specific use case. +This directory contains recommended [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/) performance settings for popular models. They can be used out-of-the-box with `trtllm-serve` via the `--config` CLI flag, or you can adjust them to your specific use case. For model-specific deployment guides, please refer to the [official documentation](https://nvidia.github.io/TensorRT-LLM/deployment-guide/index.html). diff --git a/examples/configs/__init__.py b/examples/configs/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/configs/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/configs/database/__init__.py b/examples/configs/database/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/configs/database/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
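As a usage sketch for the recommended settings described in `examples/configs/README.md` above: the model name and config filename below are placeholders rather than files introduced by this change; substitute one of the YAML files shipped under `examples/configs/` for your target model.

```bash
# Placeholder model and config path -- adjust to your deployment.
trtllm-serve <model-name> \
  --host localhost --port 8000 \
  --config examples/configs/<model-config>.yaml
```

The YAML can be edited afterwards if the recommended values do not match your GPU count or workload.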
diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 8b99f8845f..64dd80cbdf 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -23,10 +23,10 @@ cache_transceiver_config: kv_transfer_sender_future_timeout_ms: ``` -The following is an example, consisting of the `ctx_extra-llm-api-config.yaml` and `gen_extra-llm-api-config.yaml` files needed in the sections below. +The following is an example, consisting of the `ctx_config.yaml` and `gen_config.yaml` files needed in the sections below. ```yaml -# ctx_extra-llm-api-config.yaml +# ctx_config.yaml # The overlap scheduler for context servers is currently disabled, as it is # not yet supported in disaggregated context server architectures. @@ -37,7 +37,7 @@ cache_transceiver_config: ``` ```yaml -# gen_extra-llm-api-config.yaml +# gen_config.yaml cache_transceiver_config: backend: UCX @@ -54,16 +54,16 @@ Suppose we have three CUDA devices on the same machine. The first two devices ar # Start context servers CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8001 \ - --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_0 & + --config ./ctx_config.yaml &> log_ctx_0 & CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8002 \ - --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_1 & + --config ./ctx_config.yaml &> log_ctx_1 & # Start generation server CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8003 \ - --extra_llm_api_options ./gen_extra-llm-api-config.yaml &> log_gen_0 & + --config ./gen_config.yaml &> log_gen_0 & ``` Once the context and generation servers are launched, you can launch the disaggregated @@ -131,16 +131,16 @@ After starting the node and entering interactive mode, you can run the following # Start context servers CUDA_VISIBLE_DEVICES=0 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8001 \ - --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_0 & + --config ./ctx_config.yaml &> log_ctx_0 & CUDA_VISIBLE_DEVICES=1 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8002 \ - --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_1 & + --config ./ctx_config.yaml &> log_ctx_1 & # Start generation server CUDA_VISIBLE_DEVICES=2 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8003 \ - --extra_llm_api_options ./gen_extra-llm-api-config.yaml &> log_gen_0 & + --config ./gen_config.yaml &> log_gen_0 & # Start proxy trtllm-llmapi-launch trtllm-serve disaggregated -c disagg_config.yaml @@ -182,7 +182,7 @@ srun -A -p -t