From 9f283f330b0d5b1a0e9fc646d17ab652bbc58745 Mon Sep 17 00:00:00 2001 From: Wangjue Yao <32279172+wjueyao@users.noreply.github.com> Date: Fri, 19 Dec 2025 10:09:51 +0800 Subject: [PATCH 01/25] [None][feat] Support Mooncake transfer engine as a cache transceiver backend (#8309) Signed-off-by: wjueyao Signed-off-by: Shunkang <182541032+Shunkangz@users.noreply.github.co> Co-authored-by: Shunkang <182541032+Shunkangz@users.noreply.github.co> --- cpp/include/tensorrt_llm/executor/executor.h | 3 +- .../tensorrt_llm/executor/transferAgent.h | 8 + cpp/tensorrt_llm/CMakeLists.txt | 9 + .../batch_manager/cacheTransceiver.cpp | 13 +- cpp/tensorrt_llm/common/envUtils.cpp | 23 + cpp/tensorrt_llm/common/envUtils.h | 5 + cpp/tensorrt_llm/common/ipUtils.cpp | 226 ++++++++ cpp/tensorrt_llm/common/ipUtils.h | 28 + cpp/tensorrt_llm/executor/CMakeLists.txt | 1 + .../agent_utils/connection.cpp | 4 +- .../agent_utils/connection.h | 2 +- .../mooncake_utils/CMakeLists.txt | 40 ++ .../mooncake_utils/transferAgent.cpp | 546 ++++++++++++++++++ .../mooncake_utils/transferAgent.h | 165 ++++++ .../nanobind/executor/executorConfig.cpp | 3 + .../pybind/executor/executorConfig.cpp | 3 + cpp/tests/unit_tests/executor/CMakeLists.txt | 33 +- .../unit_tests/executor/agentCommTest.cpp | 73 ++- .../unit_tests/executor/transferAgentTest.cpp | 233 ++++---- .../multi_gpu/cacheTransceiverTest.cpp | 30 +- jenkins/Build.groovy | 8 +- scripts/build_wheel.py | 18 + setup.py | 6 +- .../_torch/pyexecutor/kv_cache_transceiver.py | 1 + tensorrt_llm/llmapi/llm_args.py | 9 +- tests/integration/defs/cpp/test_multi_gpu.py | 9 +- .../test_lists/test-db/l0_dgx_h100.yml | 1 + 27 files changed, 1353 insertions(+), 147 deletions(-) create mode 100644 cpp/tensorrt_llm/common/ipUtils.cpp create mode 100644 cpp/tensorrt_llm/common/ipUtils.h create mode 100644 cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt create mode 100644 cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp create mode 100644 cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index dda8f52cc8..787fa0bb7e 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -1468,7 +1468,8 @@ public: DEFAULT = 0, MPI = 1, UCX = 2, - NIXL = 3 + NIXL = 3, + MOONCAKE = 4 }; explicit CacheTransceiverConfig(std::optional backendType = std::nullopt, std::optional maxNumTokens = std::nullopt, std::optional kvTransferTimeoutMs = std::nullopt, diff --git a/cpp/include/tensorrt_llm/executor/transferAgent.h b/cpp/include/tensorrt_llm/executor/transferAgent.h index ac469fcb40..5f4ff1f061 100644 --- a/cpp/include/tensorrt_llm/executor/transferAgent.h +++ b/cpp/include/tensorrt_llm/executor/transferAgent.h @@ -391,6 +391,14 @@ template "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); return func(std::forward(args)...); } + if (backend == "mooncake") + { + auto& loader = DynLibLoader::getInstance(); + using CreateMooncakeFuncType = std::unique_ptr (*)(BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + return func(std::forward(args)...); + } TLLM_THROW("Unknown backend name."); } diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index a9e4a00729..76604ec229 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ 
-159,6 +159,10 @@ if(NIXL_ROOT) set(NIXL_WRAPPER_TARGET tensorrt_llm_nixl_wrapper) endif() +if(MOONCAKE_ROOT) + set(MOONCAKE_WRAPPER_TARGET tensorrt_llm_mooncake_wrapper) +endif() + add_subdirectory(executor) find_package(Threads REQUIRED) @@ -272,6 +276,11 @@ if(TARGET ${NIXL_WRAPPER_TARGET}) add_dependencies(${SHARED_TARGET} ${NIXL_WRAPPER_TARGET}) endif() +if(TARGET ${MOONCAKE_WRAPPER_TARGET}) + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} INTERFACE ${SHARED_TARGET}) + add_dependencies(${SHARED_TARGET} ${MOONCAKE_WRAPPER_TARGET}) +endif() + if(NOT WIN32) # Load libraries at $PREFIX/lib from # $PREFIX/lib/python3.12/site-packages/tensorrt_llm/libs diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index bb253c969f..7e4c26bfd7 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -81,6 +81,11 @@ std::unique_ptr CacheTransceiverFactory::createCacheTransc backendType = executor::CacheTransceiverConfig::BackendType::NIXL; TLLM_LOG_INFO("Enable NIXL KV cache transport."); } + else if (common::getEnvUseMooncakeKvCache()) + { + backendType = executor::CacheTransceiverConfig::BackendType::MOONCAKE; + TLLM_LOG_INFO("Enable MOONCAKE KV cache transport."); + } else if (common::getEnvUseMPIKvCache()) { backendType = executor::CacheTransceiverConfig::BackendType::MPI; @@ -203,9 +208,15 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL) { mManager = std::make_unique( - mCacheTransBufferManagerPtrs, *mCacheState); + mCacheTransBufferManagerPtrs, *mCacheState, "nixl"); TLLM_LOG_INFO("NIXL Connection Manager created"); } + else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MOONCAKE) + { + mManager = std::make_unique( + mCacheTransBufferManagerPtrs, *mCacheState, "mooncake"); + TLLM_LOG_INFO("MOONCAKE Connection Manager created"); + } else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MPI) { mMpiWorldComm = std::addressof(tensorrt_llm::mpi::MpiComm::world()); diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index fc85975acb..4a082a4ff3 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -281,6 +281,12 @@ bool getEnvUseNixlKvCache() return useNixlKvCache; } +bool getEnvUseMooncakeKvCache() +{ + static bool const useMooncakeKvCache = getBoolEnv("TRTLLM_USE_MOONCAKE_KVCACHE"); + return useMooncakeKvCache; +} + bool getEnvUseRoundRobinBlockDistForCP() { static bool const useRoundRobinBlockDistForCP = getBoolEnv("TRTLLM_USE_ROUND_ROBIN_BLOCK_DIST_FOR_CP"); @@ -343,6 +349,23 @@ std::string getEnvNixlBackend() return nixlBackend; } +std::string getEnvMooncakeInterface() +{ + static std::once_flag flag; + static std::string mooncakeInterface; + + std::call_once(flag, + [&]() + { + char const* mooncake_interface = std::getenv("TRTLLM_MOONCAKE_INTERFACE"); + if (mooncake_interface) + { + mooncakeInterface = mooncake_interface; + } + }); + return mooncakeInterface; +} + bool getEnvDisaggLayerwise() { static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE"); diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 8a3af2458d..f838f0e9ae 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -83,8 +83,11 @@ inline void 
launchWithPdlWhenEnabled(char const* name, KernelFn kernelFn, dim3 g bool getEnvUseUCXKvCache(); bool getEnvUseMPIKvCache(); + bool getEnvUseNixlKvCache(); +bool getEnvUseMooncakeKvCache(); + bool getEnvUseRoundRobinBlockDistForCP(); std::string getEnvUCXInterface(); @@ -93,6 +96,8 @@ std::string getEnvNixlInterface(); std::string getEnvNixlBackend(); +std::string getEnvMooncakeInterface(); + bool getEnvDisaggLayerwise(); bool getEnvParallelCacheSend(); diff --git a/cpp/tensorrt_llm/common/ipUtils.cpp b/cpp/tensorrt_llm/common/ipUtils.cpp new file mode 100644 index 0000000000..e4e9767194 --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.cpp @@ -0,0 +1,226 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ipUtils.h" +#include "tensorrt_llm/common/logger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ + +std::string getLocalIpByNic(std::string const& interface, int rank) +{ + struct ifaddrs* ifaddr = nullptr; + if (getifaddrs(&ifaddr) == -1) + { + TLLM_LOG_ERROR(rank, + "getLocalIpByNic: Can't get local ip from NIC Interface. Please check whether corresponding INTERFACE is " + "set " + "correctly."); + return std::string{}; + } + + for (struct ifaddrs* ifa = ifaddr; ifa != nullptr; ifa = ifa->ifa_next) + { + if (ifa->ifa_addr == nullptr) + { + continue; + } + + if (ifa->ifa_name == interface) + { + if (ifa->ifa_addr->sa_family == AF_INET) + { + char ip[INET_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "0.0.0.0") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + else if (ifa->ifa_addr->sa_family == AF_INET6) + { + char ip[INET6_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + } + } + + freeifaddrs(ifaddr); + TLLM_LOG_ERROR( + rank, "Can't get local ip from NIC Interface. 
Please check whether corresponding INTERFACE is set correctly."); + return std::string{}; +} + +std::string getLocalIpByHostname(int rank) +{ + char hostname[256]{}; + if (gethostname(hostname, sizeof(hostname)) == -1) + { + TLLM_LOG_ERROR(rank, "getLocalIpByHostname: Can't get hostname"); + return std::string{}; + } + + struct addrinfo hints = {}; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_CANONNAME; + + struct addrinfo* res = nullptr; + if (getaddrinfo(hostname, nullptr, &hints, &res) != 0) + { + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get address info for hostname"); + return std::string{}; + } + + for (struct addrinfo* p = res; p != nullptr; p = p->ai_next) + { + + if (p->ai_family == AF_INET) + { // IPv4 + char ip[INET_ADDRSTRLEN]{}; + struct sockaddr_in* ipv4 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv4->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "127.0.0.1") != 0 + && std::strcmp(ip, "0.0.0.0") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + else if (p->ai_family == AF_INET6) + { // IPv6 + char ip[INET6_ADDRSTRLEN]{}; + struct sockaddr_in6* ipv6 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv6->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + } + + freeaddrinfo(res); + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get local ip from hostname"); + return std::string{}; +} + +std::string getLocalIpByRemoteOrHostName(int rank) +{ + + // Try IPv4 + struct sockaddr_in addr + { + }; + + addr.sin_family = AF_INET; + addr.sin_port = htons(80); + // using google's public dns server to get the local ip which can be accessed from remote + char const* dns_ip_v4 = "8.8.8.8"; + inet_pton(AF_INET, dns_ip_v4, &addr.sin_addr); + + int sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr), sizeof(addr)) != -1) + { + socklen_t addr_len = sizeof(addr); + if (getsockname(sock, reinterpret_cast(&addr), &addr_len) != -1) + { + char ip[INET_ADDRSTRLEN]{}; + inet_ntop(AF_INET, &addr.sin_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try IPv6 + struct sockaddr_in6 addr6 + { + }; + + addr6.sin6_family = AF_INET6; + addr6.sin6_port = htons(80); + // using google's public dns server + char const* dns_ipv6 = "2001:4860:4860::8888"; + inet_pton(AF_INET6, dns_ipv6, &addr6.sin6_addr); + + sock = socket(AF_INET6, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr6), sizeof(addr6)) != -1) + { + socklen_t addr_len = sizeof(addr6); + if (getsockname(sock, reinterpret_cast(&addr6), &addr_len) != -1) + { + char ip[INET6_ADDRSTRLEN]{}; + inet_ntop(AF_INET6, &addr6.sin6_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try hostname + return getLocalIpByHostname(rank); +} + +std::string getLocalIp(std::string interface, int rank) +{ + std::string localIP = {}; + if (!interface.empty()) + { + localIP = getLocalIpByNic(interface, rank); + } + if (localIP.empty()) + { + localIP = getLocalIpByRemoteOrHostName(rank); + } + // check whether the localIP is valid + if (localIP.empty()) + { + TLLM_THROW("getLocalIp: Can't get local ip"); + } + return localIP; +} +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ipUtils.h 
b/cpp/tensorrt_llm/common/ipUtils.h new file mode 100644 index 0000000000..9e8081683d --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/common/config.h" +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ +std::string getLocalIp(std::string interface, int rank); +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/executor/CMakeLists.txt b/cpp/tensorrt_llm/executor/CMakeLists.txt index e0e91d4b99..6639b58275 100644 --- a/cpp/tensorrt_llm/executor/CMakeLists.txt +++ b/cpp/tensorrt_llm/executor/CMakeLists.txt @@ -91,3 +91,4 @@ target_compile_definitions(${EXECUTOR_STATIC_TARGET} add_subdirectory(cache_transmission/ucx_utils) add_subdirectory(cache_transmission/nixl_utils) +add_subdirectory(cache_transmission/mooncake_utils) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp index 9a3bb98a91..ee8e8e21b3 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp @@ -236,7 +236,7 @@ bool AgentConnection::recvReadySignal(DataContext const& ctx) const AgentConnectionManager::AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState) + CacheState cacheState, std::string const& backendType) : mCacheState(std::move(cacheState)) , mCacheTransBufferManagers(std::move(cacheTransBufferManagers)) , mRegMemDescs(MemoryType::kVRAM, {}) @@ -247,7 +247,7 @@ AgentConnectionManager::AgentConnectionManager( mAgentName = genUniqueAgentName(); // Create Agent BaseAgentConfig config{mAgentName, true}; - m_Agent = makeTransferAgent("nixl", &config); + m_Agent = makeTransferAgent(backendType, &config); TLLM_CHECK(!mCacheTransBufferManagers.empty()); std::vector memDescs; for (auto* cacheTransBufferManager : mCacheTransBufferManagers) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h index d5a780bf45..6b8bd875e4 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h @@ -277,7 +277,7 @@ class AgentConnectionManager : public ConnectionManager public: AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState); + CacheState cacheState, std::string const& backendType); ~AgentConnectionManager(); AgentConnection* recvConnect(DataContext const& ctx, void* data, size_t size) override; [[nodiscard]] std::vector getConnections(CommState const& state) override; diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt 
b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt new file mode 100644 index 0000000000..105d3b93f1 --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: NVIDIA TensorRT +# Source Code License Agreement +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this material and related documentation without an express +# license agreement from NVIDIA CORPORATION or its affiliates is strictly +# prohibited. + +# MOONCAKE is not supported on Rocky8 for now +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(MOONCAKE_ROOT AND NOT IS_ROCKY8) + find_library(TRANSFER_ENGINE_LIB transfer_engine ${MOONCAKE_ROOT}/lib) + find_path(TRANSFER_ENGINE_INCLUDE_DIR transfer_engine_c.h + ${MOONCAKE_ROOT}/include) + + message(STATUS "Find transfer engine results:") + message(STATUS " TRANSFER_ENGINE_LIB = ${TRANSFER_ENGINE_LIB}") + message( + STATUS " TRANSFER_ENGINE_INCLUDE_DIR = ${TRANSFER_ENGINE_INCLUDE_DIR}") + + if(TRANSFER_ENGINE_LIB AND TRANSFER_ENGINE_INCLUDE_DIR) + set(MOONCAKE_WRAPPER_TARGET "tensorrt_llm_mooncake_wrapper") + + add_library(${MOONCAKE_WRAPPER_TARGET} SHARED transferAgent.cpp) + target_compile_options(${MOONCAKE_WRAPPER_TARGET} PRIVATE -Wno-error) + + target_include_directories(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_INCLUDE_DIR}) + + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_LIB} CUDA::cudart) + endif() +endif() diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp new file mode 100644 index 0000000000..eabbca98c3 --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp @@ -0,0 +1,546 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/common/ipUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/transferAgent.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm::executor::kv_cache +{ + +MooncakeTransferStatus::MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount) + : mEngine{engine} + , mBatchId{batchId} + , mRequestCount{requestCount} +{ + TLLM_CHECK(mEngine); +} + +void MooncakeTransferStatus::wait() const +{ + while (!isCompleted()) + { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +[[nodiscard]] bool MooncakeTransferStatus::isCompleted() const +{ + if (mBatchFreed) + { + return true; + } + + bool has_failed = false; + for (size_t index = 0; index < mRequestCount; ++index) + { + transfer_status_t status; + int rc = getTransferStatus(mEngine, mBatchId, index, &status); + if (rc || status.status == STATUS_FAILED) + { + has_failed = true; + if (rc) + { + TLLM_LOG_ERROR( + "Failed to get transfer status for batch %lu, task %zu: error code %d", mBatchId, index, rc); + } + else + { + TLLM_LOG_ERROR("Transfer failed for batch %lu, task %zu: status %d", mBatchId, index, status.status); + } + } + else if (status.status == STATUS_PENDING || status.status == STATUS_WAITING) + { + TLLM_LOG_DEBUG("Transfer is pending for batch %lu, task %zu", mBatchId, index); + return false; + } + } + if (!has_failed) + { + // Each batchId has the batch size, and cannot process more requests + // than the batch size. So, free the batch id here to workaround the issue + // where the same batchId could be used to post multiple transfer. + freeBatchID(mEngine, mBatchId); + mBatchFreed = true; + TLLM_LOG_DEBUG("Batch ID %lu freed, future calls will return true directly", mBatchId); + } + // Currently, we cannot distinguish between failed and completed from return value. 
+ TLLM_LOG_DEBUG("Transfer is completed for batch %lu", mBatchId); + return true; +} + +const std::string MooncakeBase64Helper::STANDARD_CHARS + = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +std::string MooncakeBase64Helper::encode(std::vector const& data) +{ + return encodeInternal(data, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::encode(std::string const& data) +{ + std::vector vec(data.begin(), data.end()); + return encode(vec); +} + +std::vector MooncakeBase64Helper::decode(std::string const& encoded) +{ + return decodeInternal(encoded, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::decodeToString(std::string const& encoded) +{ + auto vec = decode(encoded); + return std::string(vec.begin(), vec.end()); +} + +std::string MooncakeBase64Helper::encodeInternal(std::vector const& data, std::string const& chars) +{ + std::string encoded; + size_t i = 0; + size_t j = 0; + std::array charArray3{}; + std::array charArray4{}; + size_t dataLen = data.size(); + uint8_t const* bytes = data.data(); + + while (dataLen--) + { + charArray3[i++] = *(bytes++); + if (i == 3) + { + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (i = 0; i < 4; i++) + { + encoded += chars[charArray4[i]]; + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 3; j++) + { + charArray3[j] = '\0'; + } + + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (j = 0; j < i + 1; j++) + { + encoded += chars[charArray4[j]]; + } + + while (i++ < 3) + { + encoded += '='; + } + } + + return encoded; +} + +std::vector MooncakeBase64Helper::decodeInternal(std::string const& encoded, std::string const& chars) +{ + size_t encodedLen = encoded.size(); + size_t i = 0; + size_t j = 0; + size_t in_ = 0; + std::array charArray3{}; + std::array charArray4{}; + std::vector decoded; + + std::string cleanEncoded; + for (char c : encoded) + { + if (!isWhitespace(c)) + { + cleanEncoded += c; + } + } + + encodedLen = cleanEncoded.size(); + + while (encodedLen-- && cleanEncoded[in_] != '=' && isBase64(cleanEncoded[in_], chars)) + { + charArray4[i++] = cleanEncoded[in_]; + in_++; + if (i == 4) + { + for (i = 0; i < 4; i++) + { + charArray4[i] = chars.find(charArray4[i]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (i = 0; i < 3; i++) + { + decoded.push_back(charArray3[i]); + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 4; j++) + { + charArray4[j] = 0; + } + + for (j = 0; j < 4; j++) + { + charArray4[j] = chars.find(charArray4[j]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (j = 0; j < i - 1; j++) + { + decoded.push_back(charArray3[j]); + } + } + + return decoded; +} + +bool MooncakeBase64Helper::isBase64(uint8_t c, std::string const& chars) +{ + return (isalnum(c) || (c == chars[62]) || (c == chars[63])); +} + 
+bool MooncakeBase64Helper::isWhitespace(uint8_t c) +{ + return (c == ' ' || c == '\n' || c == '\r' || c == '\t'); +} + +MooncakeTransferAgent::MooncakeTransferAgent(BaseAgentConfig const& config) +{ + mLocalAgentName = config.mName; + std::string segmentName = "127.0.0.1"; + + if (getenv("TLLM_MOONCAKE_IP_ADDR")) + { + segmentName = std::string(getenv("TLLM_MOONCAKE_IP_ADDR")); + } + else + { + auto ip = common::getLocalIp(common::getEnvMooncakeInterface(), mpi::MpiComm::session().getRank()); + if (!ip.empty()) + segmentName = ip; + } + + mEngine = createTransferEngine("P2PHANDSHAKE", segmentName.c_str(), "", 0, true); +} + +void MooncakeTransferAgent::registerMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::registerMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + it->second->addRef(); + continue; + } + + int err = registerLocalMemory(mEngine, reinterpret_cast(desc.getAddr()), desc.getLen(), "*", 1); + + TLLM_CHECK_WITH_INFO(err == 0, "registerLocalMemory failed, addr: %p, len: %lu", + reinterpret_cast(desc.getAddr()), desc.getLen()); + + auto mooncakeDesc = std::make_shared(desc); + mMemRegInfo[desc.getAddr()] = std::move(mooncakeDesc); + } +} + +void MooncakeTransferAgent::deregisterMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::deregisterMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + auto const& mooncakeDesc = it->second; + mooncakeDesc->releaseRef(); + if (mooncakeDesc->getRefCount()) + continue; + + int err = unregisterLocalMemory(mEngine, reinterpret_cast(desc.getAddr())); + + TLLM_CHECK_WITH_INFO( + err == 0, "unregisterLocalMemory failed, addr: %p", reinterpret_cast(desc.getAddr())); + + mMemRegInfo.erase(desc.getAddr()); + } + } +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::loadRemoteAgent"); + + // Do the same thing as loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) + loadRemoteAgent(name, std::move(agentDesc.getBackendAgentDesc())); +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) +{ + TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), + "MooncakeTransferAgent::loadRemoteAgent loadRemoteAgent to %s remoteagent name: %s", connectionInfo.c_str(), + name.c_str()); + + std::lock_guard lock(mMutex); + auto segmentId = openSegment(mEngine, connectionInfo.c_str()); + + TLLM_CHECK_WITH_INFO( + segmentId >= 0, "loadRemoteAgent openSegment failed, connectionInfo: %s", connectionInfo.c_str()); + + mConnectedAgents[name].segmentId = segmentId; +} + +void MooncakeTransferAgent::invalidateRemoteAgent(std::string const& name) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::invalidateRemoteAgent"); +} + +AgentDesc MooncakeTransferAgent::getLocalAgentDesc() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalAgentDesc"); + + // Using connection info as agent desc + const static size_t kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalIpAndPort failed"); + + return AgentDesc{std::string(connectionInfo)}; +} + +ConnectionInfoType 
MooncakeTransferAgent::getLocalConnectionInfo() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalConnectionInfo"); + + const static size_t kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalConnectionInfo failed"); + + return std::string(connectionInfo); +} + +[[nodiscard]] std::unique_ptr MooncakeTransferAgent::submitTransferRequests( + TransferRequest const& request) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::submitTransferRequests"); + + bool hasNotif = false; + std::string syncMessage; + + if (request.getSyncMessage().has_value()) + { + hasNotif = true; + syncMessage = request.getSyncMessage().value(); + } + + const static size_t kMaxRequestCount = 1024; + uint64_t batchId = allocateBatchID(mEngine, kMaxRequestCount); + + TLLM_CHECK_WITH_INFO(batchId != INVALID_BATCH, "allocateBatchID failed"); + + int segmentId; + { + std::lock_guard lock(mMutex); + std::string remoteName = request.getRemoteName(); + + auto it = mConnectedAgents.find(remoteName); + if (it == mConnectedAgents.end()) + { + std::string error = "Remote agent " + remoteName + "not found"; + TLLM_THROW(error); + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + auto localDescs = request.getSrcDescs().getDescs(); + auto remoteDescs = request.getDstDescs().getDescs(); + + TLLM_CHECK_WITH_INFO(localDescs.size() == remoteDescs.size(), "Number of local and remote memory must match"); + + size_t requestCount = localDescs.size(); + std::vector transferRequests(requestCount); + + for (size_t index = 0; index < requestCount; ++index) + { + TLLM_CHECK_WITH_INFO( + localDescs[index].getLen() == remoteDescs[index].getLen(), "Length of local and remote memory must match"); + + transferRequests[index].opcode = (request.getOp() == TransferOp::kREAD) ? 
OPCODE_READ : OPCODE_WRITE; + transferRequests[index].source = reinterpret_cast(localDescs[index].getAddr()); + transferRequests[index].target_offset = remoteDescs[index].getAddr(); + transferRequests[index].length = localDescs[index].getLen(); + transferRequests[index].target_id = segmentId; + } + + int rc = 0; + if (hasNotif) + { + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + notifyMsg.msg = const_cast(syncMessage.c_str()); + rc = submitTransferWithNotify(mEngine, batchId, transferRequests.data(), requestCount, notifyMsg); + } + else + { + rc = submitTransfer(mEngine, batchId, transferRequests.data(), requestCount); + } + + TLLM_CHECK_WITH_INFO(rc == 0, "submitTransfer failed with status: %d", rc); + + return std::make_unique(mEngine, batchId, requestCount); +} + +void MooncakeTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage"); + int segmentId; + { + std::lock_guard lock(mMutex); + auto it = mConnectedAgents.find(name); + + if (it == mConnectedAgents.end()) + { + TLLM_LOG_WARNING("Remote agent %s not found", name.c_str()); + return; + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + std::string encoded = MooncakeBase64Helper::encode(syncMessage); + notifyMsg.msg = const_cast(encoded.c_str()); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage notifyMsg.name: %s, notifyMsg.msg: %s", notifyMsg.name, + notifyMsg.msg); + + int ret = genNotifyInEngine(mEngine, segmentId, notifyMsg); + + TLLM_CHECK_WITH_INFO(ret == 0, "genNotifyInEngine failed with status: %d", ret); +} + +[[nodiscard]] std::unordered_map> MooncakeTransferAgent::getNotifiedSyncMessages() +{ + std::unordered_map> notifs; + int size = 0; + + notify_msg_t* notifyMsgs = getNotifsFromEngine(mEngine, &size); + + TLLM_CHECK_WITH_INFO(size >= 0, "getNotifsFromEngine returned negative size: %d", size); + + for (int i = 0; i < size; i++) + { + if (notifyMsgs[i].msg == nullptr) + { + TLLM_LOG_WARNING("Message pointer is null for: %s", notifyMsgs[i].name); + continue; + } + + std::string decoded = MooncakeBase64Helper::decodeToString(notifyMsgs[i].msg); + notifs[notifyMsgs[i].name].emplace_back(std::move(decoded)); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::getNotifiedSyncMessages getNotifsFromEngine: %s, %s", notifyMsgs[i].name, + notifyMsgs[i].msg); + } + + freeNotifsMsgBuf(notifyMsgs, size); + return notifs; +} + +bool MooncakeTransferAgent::checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::checkRemoteDescs"); + return true; +} + +MooncakeTransferAgent::~MooncakeTransferAgent() +{ + destroyTransferEngine(mEngine); + TLLM_LOG_DEBUG("MooncakeTransferAgent::~MooncakeTransferAgent"); +} + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config) + { + TLLM_CHECK(config); + return std::make_unique(*config); + } +} + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h new file mode 100644 index 0000000000..0aeeedeae1 --- /dev/null +++ 
b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h @@ -0,0 +1,165 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "tensorrt_llm/executor/transferAgent.h" +#include "transfer_engine_c.h" + +namespace tensorrt_llm::executor::kv_cache +{ + +class MooncakeTransferStatus final : public TransferStatus +{ +public: + MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount); + + [[nodiscard]] bool isCompleted() const override; + + void wait() const override; + +private: + transfer_engine_t mEngine; + uint64_t mBatchId; + size_t mRequestCount; + mutable bool mBatchFreed = false; +}; + +class MooncakeMemoryDesc +{ +public: + MooncakeMemoryDesc(MemoryDesc desc) + : mDesc{std::move(desc)} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc(MooncakeMemoryDesc const& other) + : mDesc{other.mDesc} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc& operator=(MooncakeMemoryDesc const&) = delete; + + ~MooncakeMemoryDesc() = default; + + void addRef() noexcept + { + ++mRefCnt; + } + + int releaseRef() noexcept + { + return --mRefCnt; + } + + int getRefCount() const noexcept + { + return mRefCnt; + } + + MemoryDesc const& getDesc() const noexcept + { + return mDesc; + } + +private: + MemoryDesc mDesc; + int mRefCnt; +}; + +class MooncakeBase64Helper +{ +public: + static std::string encode(std::vector const& data); + static std::string encode(std::string const& data); + + static std::vector decode(std::string const& encoded); + static std::string decodeToString(std::string const& encoded); + +private: + static const std::string STANDARD_CHARS; + + static std::string encodeInternal(std::vector const& data, std::string const& chars); + static std::vector decodeInternal(std::string const& encoded, std::string const& chars); + + static inline bool isBase64(uint8_t c, std::string const& chars); + static inline bool isWhitespace(uint8_t c); +}; + +class MooncakeTransferAgent final : public BaseTransferAgent +{ +public: + MooncakeTransferAgent(BaseAgentConfig const& config); + ~MooncakeTransferAgent(); + + void registerMemory(RegisterDescs const& descs) override; + + void deregisterMemory(RegisterDescs const& descs) override; + + void loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) override; + + void loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) override; + + void invalidateRemoteAgent(std::string const& name) override; + + AgentDesc getLocalAgentDesc() override; + + ConnectionInfoType getLocalConnectionInfo() override; + + [[nodiscard]] std::unique_ptr submitTransferRequests(TransferRequest const& request) override; + + void notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) override; + + [[nodiscard]] std::unordered_map> getNotifiedSyncMessages() override; + + 
bool checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) override; + +private: + struct AgentInfo + { + int segmentId; + }; + + mutable std::mutex mMutex; + transfer_engine_t mEngine; + std::unordered_map> mMemRegInfo; + std::unordered_map mConnectedAgents; + std::string mLocalAgentName; +}; + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + [[nodiscard]] std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config); +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp index bed5db70f7..051586b7fe 100644 --- a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp @@ -449,6 +449,7 @@ void initConfigBindings(nb::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -460,6 +461,8 @@ void initConfigBindings(nb::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp index 7919423256..4fe20a6c66 100644 --- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp @@ -431,6 +431,7 @@ void initConfigBindings(pybind11::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -442,6 +443,8 @@ void initConfigBindings(pybind11::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tests/unit_tests/executor/CMakeLists.txt b/cpp/tests/unit_tests/executor/CMakeLists.txt index de3a694d21..069363c5ed 100644 --- a/cpp/tests/unit_tests/executor/CMakeLists.txt +++ b/cpp/tests/unit_tests/executor/CMakeLists.txt @@ -38,10 +38,31 @@ add_gtest(ucxCommTest ucxCommTest.cpp) target_link_libraries(ucxCommTest PRIVATE ${Python3_LIBRARIES}) target_link_libraries(serializeUtilsTest PRIVATE ${Python3_LIBRARIES}) -if(NIXL_ROOT) - add_gtest(transferAgentTest transferAgentTest.cpp) - add_gtest(agentCommTest agentCommTest.cpp) - target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) - target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper - ${Python3_LIBRARIES}) +# Skip MOONCAKE related tests on Rocky8 +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) 
+endif() + +if(NIXL_ROOT OR (MOONCAKE_ROOT AND NOT IS_ROCKY8)) + add_gtest(agentCommTest agentCommTest.cpp) + add_gtest(transferAgentTest transferAgentTest.cpp) + + if(NIXL_ROOT) + target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest PRIVATE TEST_NIXL_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_NIXL_BACKEND=1) + endif() + + if(MOONCAKE_ROOT) + target_link_libraries(transferAgentTest + PRIVATE tensorrt_llm_mooncake_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_mooncake_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest + PRIVATE TEST_MOONCAKE_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_MOONCAKE_BACKEND=1) + endif() endif() diff --git a/cpp/tests/unit_tests/executor/agentCommTest.cpp b/cpp/tests/unit_tests/executor/agentCommTest.cpp index ccd54ab926..025a3a8bc6 100644 --- a/cpp/tests/unit_tests/executor/agentCommTest.cpp +++ b/cpp/tests/unit_tests/executor/agentCommTest.cpp @@ -22,22 +22,54 @@ using namespace tensorrt_llm::batch_manager::kv_cache_manager; using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::executor::kv_cache; -bool needSkipTest(std::string& skipReason) +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + +bool needSkipTest(std::string const& backend, std::string& skipReason) { bool skip = false; try { auto& loader = tensorrt_llm::executor::kv_cache::DynLibLoader::getInstance(); - using CreateNixlFuncType = std::unique_ptr (*)( - tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); - auto* func = loader.getFunctionPointer( - "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + if (backend == "nixl") + { + using CreateNixlFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + } + else if (backend == "mooncake") + { + using CreateMooncakeFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + } + else + { + skip = true; + skipReason = "Unknown backend: " + backend; + } } catch (std::exception const& e) { std::string error = e.what(); - if (error.find("libtensorrt_llm_nixl_wrapper.so") != std::string::npos) + std::string libName + = (backend == "nixl") ? 
"libtensorrt_llm_nixl_wrapper.so" : "libtensorrt_llm_mooncake_wrapper.so"; + if (error.find(libName) != std::string::npos) { skip = true; skipReason = error; @@ -46,17 +78,26 @@ bool needSkipTest(std::string& skipReason) return skip; } -class AgentCommTest : public ::testing::Test +class AgentCommTest : public ::testing::TestWithParam { protected: void SetUp() override { + backend = GetParam(); std::string skipReason; - if (needSkipTest(skipReason)) + if (needSkipTest(backend, skipReason)) { GTEST_SKIP() << skipReason; } - setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + + if (backend == "nixl") + { + setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + } + else if (backend == "mooncake") + { + setenv("TRTLLM_USE_MOONCAKE_KVCACHE", "1", 1); + } auto constexpr numLayers = 8; auto constexpr numHeads = 16; @@ -106,15 +147,16 @@ protected: mCacheState.reset(); } + std::string backend; std::unique_ptr mTransBufferManager; std::unique_ptr mCacheManager; std::unique_ptr mCacheState; }; -TEST_F(AgentCommTest, AgentConnectionManagerBasic) +TEST_P(AgentCommTest, AgentConnectionManagerBasic) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager = std::make_unique(bufferManagers, *mCacheState, backend); ASSERT_TRUE(connectionManager != nullptr); ASSERT_EQ(connectionManager->getCacheTransBufferManagers().size(), bufferManagers.size()); ASSERT_TRUE(connectionManager->getCacheTransBufferManagers().front() != nullptr); @@ -126,11 +168,11 @@ TEST_F(AgentCommTest, AgentConnectionManagerBasic) ASSERT_EQ(commState.getAgentState().size(), 1); } -TEST_F(AgentCommTest, AgentConnectionManagerConnect) +TEST_P(AgentCommTest, AgentConnectionManagerConnect) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState); - auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState, backend); + auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState, backend); auto agentName0 = connectionManager0->getAgentName(); auto agentName1 = connectionManager1->getAgentName(); ASSERT_TRUE(!agentName0.empty()); @@ -189,3 +231,6 @@ TEST_F(AgentCommTest, AgentConnectionManagerConnect) } TLLM_LOG_INFO("after finish"); } + +INSTANTIATE_TEST_SUITE_P(AvailableBackends, AgentCommTest, ::testing::ValuesIn(getAvailableBackends()), + [](::testing::TestParamInfo const& info) { return info.param; }); diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp index 0f21449f30..7218611a0e 100644 --- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp +++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp @@ -22,11 +22,27 @@ #include #include +#include namespace fs = std::filesystem; using namespace tensorrt_llm::executor::kv_cache; +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + class RegisteredHostMemory { public: @@ -54,100 +70,105 @@ private: BaseTransferAgent* mAgentPtr{}; }; -class TransferAgentTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init) +class TransferAgentTest : public ::testing::TestWithParam // NOLINT(cppcoreguidelines-pro-type-member-init) { public: - void SetUp() override {} + void SetUp() 
override + { + backend = GetParam(); + } void TearDown() override {} [[nodiscard]] std::unique_ptr makeTransferAgent(BaseAgentConfig const& config) { - return tensorrt_llm::executor::kv_cache::makeTransferAgent("nixl", &config); + return tensorrt_llm::executor::kv_cache::makeTransferAgent(backend, &config); } + + std::string backend; }; -TEST_F(TransferAgentTest, Basic) +TEST_P(TransferAgentTest, Basic) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); - // wait for regMem is unpacked by nixlAgent0 + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + // wait for regMem is unpacked by xferAgent0 } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Basic2) +TEST_P(TransferAgentTest, Basic2) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = 
false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest readReq{TransferOp::kREAD, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(readReq); + auto status = xferAgent0->submitTransferRequests(readReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, DeviceMemory) +TEST_P(TransferAgentTest, DeviceMemory) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); char* dev_ptr0; char* dev_ptr1; size_t size = 100; @@ -159,20 +180,20 @@ TEST_F(TransferAgentTest, DeviceMemory) cudaMemcpy(dev_ptr0, memory0.data(), size, cudaMemcpyHostToDevice); cudaMemcpy(dev_ptr1, memory1.data(), size, cudaMemcpyHostToDevice); RegisteredHostMemory regMem0( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, nixlAgent0.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, xferAgent0.get()); RegisteredHostMemory regMem1( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, nixlAgent1.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); cudaMemcpy(memory0.data(), dev_ptr0, size, cudaMemcpyDeviceToHost); @@ -181,98 +202,99 @@ TEST_F(TransferAgentTest, DeviceMemory) TLLM_CHECK(memory0 == memory1); TLLM_CUDA_CHECK(cudaFree(dev_ptr0)); TLLM_CUDA_CHECK(cudaFree(dev_ptr1)); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Connect) +TEST_P(TransferAgentTest, Connect) { std::string const agent0{"agent0"}, agent1{"agent1"}, agent2{"agent2"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}, config2{agent2, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); - auto nixlAgent2 = makeTransferAgent(config2); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); + auto xferAgent2 = makeTransferAgent(config2); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); MemoryDescs memDescs0{MemoryType::kDRAM, {MemoryDesc{memory0}}}; MemoryDescs 
memDescs1{MemoryType::kDRAM, {MemoryDesc{memory1}}}; - nixlAgent0->registerMemory(memDescs0); - nixlAgent1->registerMemory(memDescs1); - nixlAgent2->registerMemory(memDescs0); + xferAgent0->registerMemory(memDescs0); + xferAgent1->registerMemory(memDescs1); + xferAgent2->registerMemory(memDescs0); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, memDescs1); + checked = xferAgent0->checkRemoteDescs(agent1, memDescs1); } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, memDescs0, memDescs1, agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent2->loadRemoteAgent(agent1, connectionInfo); + xferAgent2->loadRemoteAgent(agent1, connectionInfo); checked = false; do { - checked = nixlAgent2->checkRemoteDescs(agent1, memDescs1); + checked = xferAgent2->checkRemoteDescs(agent1, memDescs1); } while (!checked); TransferRequest writeReq2{TransferOp::kWRITE, memDescs0, memDescs1, agent1}; - auto status2 = nixlAgent2->submitTransferRequests(writeReq2); + auto status2 = xferAgent2->submitTransferRequests(writeReq2); status2->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); - nixlAgent2->invalidateRemoteAgent(agent1); - nixlAgent0->deregisterMemory(memDescs0); - nixlAgent1->deregisterMemory(memDescs1); - nixlAgent2->deregisterMemory(memDescs0); + xferAgent0->invalidateRemoteAgent(agent1); + xferAgent2->invalidateRemoteAgent(agent1); + xferAgent0->deregisterMemory(memDescs0); + xferAgent1->deregisterMemory(memDescs1); + xferAgent2->deregisterMemory(memDescs0); } -TEST_F(TransferAgentTest, SyncMessage) +TEST_P(TransferAgentTest, SyncMessage) { constexpr std::size_t MAX_QUERY_TIMES = std::numeric_limits::max(); std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent0.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent0.get()); - RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent1.get()); - RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent1.get()); + RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = 
nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); } while (!checked); auto syncMessage = std::string("agent_sync_message"); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); - nixlAgent0->notifySyncMessage(agent1, syncMessage); + auto status = xferAgent0->submitTransferRequests(writeReq); + xferAgent0->notifySyncMessage(agent1, syncMessage); - auto notif = nixlAgent1->getNotifiedSyncMessages(); + auto notif = xferAgent1->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif.size() == 0; i++) { - notif = nixlAgent1->getNotifiedSyncMessages(); + notif = xferAgent1->getNotifiedSyncMessages(); } + status->wait(); TLLM_CHECK(status->isCompleted()); TLLM_CHECK(notif.size() == 1); TLLM_CHECK(notif[agent0].size() == 1); @@ -281,25 +303,25 @@ TEST_F(TransferAgentTest, SyncMessage) TLLM_CHECK(memory0 == memory1); std::string syncMessage2 = "two_agent_sync_message"; - nixlAgent0->notifySyncMessage(agent1, syncMessage2); - auto notif2 = nixlAgent1->getNotifiedSyncMessages(); + xferAgent0->notifySyncMessage(agent1, syncMessage2); + auto notif2 = xferAgent1->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif2.size() == 0; i++) { - notif2 = nixlAgent1->getNotifiedSyncMessages(); + notif2 = xferAgent1->getNotifiedSyncMessages(); } TLLM_CHECK(notif2.size() == 1); TLLM_CHECK(notif2[agent0].size() == 1); TLLM_CHECK(notif2[agent0][0] == syncMessage2); - // nixlAgent1->loadRemoteAgent(agent0); - auto connectionInfo2 = nixlAgent0->getLocalConnectionInfo(); - nixlAgent1->loadRemoteAgent(agent0, connectionInfo2); + // xferAgent1->loadRemoteAgent(agent0); + auto connectionInfo2 = xferAgent0->getLocalConnectionInfo(); + xferAgent1->loadRemoteAgent(agent0, connectionInfo2); std::string syncMessage3 = "three_agent_sync_message"; - nixlAgent1->notifySyncMessage(agent0, syncMessage3); - auto notif3 = nixlAgent0->getNotifiedSyncMessages(); + xferAgent1->notifySyncMessage(agent0, syncMessage3); + auto notif3 = xferAgent0->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif3.size() == 0; i++) { - notif3 = nixlAgent0->getNotifiedSyncMessages(); + notif3 = xferAgent0->getNotifiedSyncMessages(); } TLLM_CHECK(notif3.size() == 1); TLLM_CHECK(notif3[agent1].size() == 1); @@ -308,19 +330,20 @@ TEST_F(TransferAgentTest, SyncMessage) bool checked2 = false; do { - checked2 = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked2 = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked2); std::string syncMessage4 = "four_agent_sync_message"; TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0}; - auto status1 = nixlAgent1->submitTransferRequests(writeReq1); - nixlAgent1->notifySyncMessage(agent0, syncMessage4); + auto status1 = xferAgent1->submitTransferRequests(writeReq1); + xferAgent1->notifySyncMessage(agent0, syncMessage4); - auto notif4 = nixlAgent0->getNotifiedSyncMessages(); + auto notif4 = xferAgent0->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++) { - 
notif4 = nixlAgent0->getNotifiedSyncMessages(); + notif4 = xferAgent0->getNotifiedSyncMessages(); } + status1->wait(); TLLM_CHECK(status1->isCompleted()); TLLM_CHECK(notif4.size() == 1); TLLM_CHECK(notif4[agent1].size() == 1); @@ -335,11 +358,11 @@ TEST_F(TransferAgentTest, SyncMessage) std::stringstream ss; Serialization::serialize(state, ss); std::string serializedState = ss.str(); - nixlAgent0->notifySyncMessage(agent1, serializedState); - auto notif5 = nixlAgent1->getNotifiedSyncMessages(); + xferAgent0->notifySyncMessage(agent1, serializedState); + auto notif5 = xferAgent1->getNotifiedSyncMessages(); for (size_t i = 0; i < MAX_QUERY_TIMES && notif5.size() == 0; i++) { - notif5 = nixlAgent1->getNotifiedSyncMessages(); + notif5 = xferAgent1->getNotifiedSyncMessages(); } TLLM_CHECK(notif5.size() == 1); TLLM_CHECK(notif5[agent0].size() == 1); @@ -348,10 +371,16 @@ TEST_F(TransferAgentTest, SyncMessage) auto state2 = Serialization::deserializeCommState(ss2); TLLM_CHECK(state2 == state); - nixlAgent0->invalidateRemoteAgent(agent1); - nixlAgent1->invalidateRemoteAgent(agent0); + xferAgent0->invalidateRemoteAgent(agent1); + xferAgent1->invalidateRemoteAgent(agent0); } +INSTANTIATE_TEST_SUITE_P(AvailableBackends, TransferAgentTest, ::testing::ValuesIn(getAvailableBackends()), + [](::testing::TestParamInfo const& info) { return info.param; }); + +// Skip LoopbackAgentTest for mooncake backend for now +#ifdef TEST_NIXL_BACKEND + class LoopbackAgentTest : public ::testing::Test, public ::testing::WithParamInterface // NOLINT(cppcoreguidelines-pro-type-member-init) { @@ -466,3 +495,5 @@ TEST_P(LoopbackAgentTest, GpuToFile) } INSTANTIATE_TEST_SUITE_P(, LoopbackAgentTest, ::testing::Values(true, false)); + +#endif // TEST_NIXL_BACKEND diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp index 17ca989eee..41dd8e7a92 100644 --- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -713,7 +714,7 @@ protected: return; } else if (tensorrt_llm::common::getEnvUseMPIKvCache() || tensorrt_llm::common::getEnvUseUCXKvCache() - || tensorrt_llm::common::getEnvUseNixlKvCache()) + || tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache()) { int maxNumTokens = 2048; mCacheTransBufferManagers.clear(); @@ -729,7 +730,15 @@ protected: } bool isUcx = tensorrt_llm::common::getEnvUseUCXKvCache(); bool isNixl = tensorrt_llm::common::getEnvUseNixlKvCache(); - TLLM_LOG_INFO("Enable %s KV cache transport.", isUcx ? "UCX" : isNixl ? "NIXL" : "MPI"); + bool isMooncake = tensorrt_llm::common::getEnvUseMooncakeKvCache(); + // Skip tests for MOONCAKE when on Rocky8 + bool isRocky8 = std::filesystem::exists("/etc/redhat-release"); + isMooncake = isMooncake && !isRocky8; + TLLM_LOG_INFO("Enable %s KV cache transport.", + isUcx ? "UCX" + : isNixl ? "NIXL" + : isMooncake ? 
"MOONCAKE" + : "MPI"); if (isUcx) { @@ -756,7 +765,12 @@ protected: setenv("TRTLLM_NIXL_PORT", std::to_string(port).c_str(), 1); mConnectionManager - = std::make_unique(bufferManagers, *mCacheState); + = std::make_unique(bufferManagers, *mCacheState, "nixl"); + } + else if (isMooncake) + { + mConnectionManager = std::make_unique( + bufferManagers, *mCacheState, "mooncake"); } else { @@ -783,7 +797,7 @@ protected: std::vector contextRankVec(mContextRankSize); std::iota(contextRankVec.begin(), contextRankVec.end(), 0); - if (isUcx || isNixl) + if (isUcx || isNixl || isMooncake) { auto commState = mConnectionManager->getCommState(); namespace su = tensorrt_llm::executor::serialize_utils; @@ -1286,9 +1300,9 @@ TEST_P(AsymmetricalCacheTest, TestCase) int indexerDimPerHead = std::get<17>(param); int indexerKCacheQuantBlockSize = std::get<18>(param); - if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache())) { - GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP."; } std::vector lenList = {30, 10, 60, 80}; if (genCp > 1) @@ -1410,9 +1424,9 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase) int indexerDimPerHead = std::get<17>(param); int indexerKCacheQuantBlockSize = std::get<18>(param); - if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache())) { - GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP."; } setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index bfa3af4412..261c0a6d3a 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -60,12 +60,12 @@ def BUILD_CONFIGS = [ // Vanilla TARNAME is used for packaging in runLLMPackage // cmake-vars cannot be empty, so passing (default) multi-device configuration. 
(CONFIG_LINUX_X86_64_VANILLA) : [ - (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks", (TARNAME) : "TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], (CONFIG_LINUX_X86_64_PYBIND) : [ - (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks", (TARNAME) : "pybind-TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], @@ -80,13 +80,13 @@ def BUILD_CONFIGS = [ (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], (CONFIG_LINUX_AARCH64): [ - (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl", + (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake", (TARNAME) : "TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;103-real;120-real", (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA ], (CONFIG_LINUX_AARCH64_PYBIND): [ - (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl", + (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake", (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;103-real;120-real", (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 03aae58617..ed2d1b88fd 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -458,6 +458,7 @@ def main(*, trt_root: str = '/usr/local/tensorrt', nccl_root: str = None, nixl_root: str = None, + mooncake_root: str = None, internal_cutlass_kernels_root: str = None, clean: bool = False, clean_wheel: bool = False, @@ -559,6 +560,11 @@ def main(*, if nixl_root is not None: cmake_def_args.append(f"-DNIXL_ROOT={nixl_root}") + if mooncake_root is not None: + if on_windows: + raise RuntimeError("Mooncake is not supported on Windows.") + cmake_def_args.append(f"-DMOONCAKE_ROOT={mooncake_root}") + build_dir = get_build_dir(build_dir, build_type) first_build = not Path(build_dir, "CMakeFiles").exists() @@ -819,6 +825,14 @@ def main(*, build_run( f"find {nixl_dir} -type f -name '*.so*' -exec patchelf --set-rpath \'$ORIGIN:$ORIGIN/plugins:$ORIGIN/../:$ORIGIN/../ucx/:$ORIGIN/../../ucx/\' {{}} \\;" ) + if os.path.exists( + build_dir / + "tensorrt_llm/executor/cache_transmission/mooncake_utils/libtensorrt_llm_mooncake_wrapper.so" + ): + install_file( + build_dir / + 
"tensorrt_llm/executor/cache_transmission/mooncake_utils/libtensorrt_llm_mooncake_wrapper.so", + lib_dir / "libtensorrt_llm_mooncake_wrapper.so") install_file( build_dir / "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_0.so", @@ -1041,6 +1055,10 @@ def add_arguments(parser: ArgumentParser): help="Directory containing NCCL headers and libraries") parser.add_argument("--nixl_root", help="Directory containing NIXL headers and libraries") + parser.add_argument( + "--mooncake_root", + help= + "Directory containing Mooncake transfer engine headers and libraries") parser.add_argument( "--internal-cutlass-kernels-root", default="", diff --git a/setup.py b/setup.py index 5c61029aad..094ca01467 100644 --- a/setup.py +++ b/setup.py @@ -114,9 +114,9 @@ else: 'libs/libnvinfer_plugin_tensorrt_llm.so', 'libs/libtensorrt_llm_ucx_wrapper.so', 'libs/libdecoder_attention_0.so', 'libs/libtensorrt_llm_nixl_wrapper.so', 'libs/nixl/**/*', - 'libs/ucx/**/*', 'libs/libpg_utils.so', - 'libs/libdecoder_attention_1.so', 'libs/nvshmem/License.txt', - 'libs/nvshmem/nvshmem_bootstrap_uid.so.3', + 'libs/libtensorrt_llm_mooncake_wrapper.so', 'libs/ucx/**/*', + 'libs/libpg_utils.so', 'libs/libdecoder_attention_1.so', + 'libs/nvshmem/License.txt', 'libs/nvshmem/nvshmem_bootstrap_uid.so.3', 'libs/nvshmem/nvshmem_transport_ibgda.so.103', 'bindings.*.so', 'deep_ep/LICENSE', 'deep_ep/*.py', 'deep_ep_cpp_tllm.*.so', "include/**/*", 'deep_gemm/LICENSE', 'deep_gemm/include/**/*', diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py index 73ee3f5c7b..5616be7708 100644 --- a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py +++ b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py @@ -42,6 +42,7 @@ def create_kv_cache_transceiver( cache_transceiver_config.backend = "NIXL" # Ordered by priority env_vars = [("TRTLLM_USE_UCX_KVCACHE", "UCX"), + ("TRTLLM_USE_MOONCAKE_KVCACHE", "MOONCAKE"), ("TRTLLM_USE_MPI_KVCACHE", "MPI")] for env_var, be_type in env_vars: if getenv(env_var) == "1": diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index c2d5f23f50..2f22f49340 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1739,10 +1739,11 @@ class CacheTransceiverConfig(StrictBaseModel, PybindMirror): Configuration for the cache transceiver. 
""" - backend: Optional[Literal["DEFAULT", "UCX", "NIXL", "MPI"]] = Field( - default=None, - description= - "The communication backend type to use for the cache transceiver.") + backend: Optional[Literal[ + "DEFAULT", "UCX", "NIXL", "MOONCAKE", "MPI"]] = Field( + default=None, + description= + "The communication backend type to use for the cache transceiver.") max_tokens_in_buffer: Optional[int] = Field( default=None, diff --git a/tests/integration/defs/cpp/test_multi_gpu.py b/tests/integration/defs/cpp/test_multi_gpu.py index 7cf92efaad..1124178ccc 100644 --- a/tests/integration/defs/cpp/test_multi_gpu.py +++ b/tests/integration/defs/cpp/test_multi_gpu.py @@ -25,6 +25,7 @@ class KVCacheType(Enum): MPI = auto() UCX = auto() NIXL = auto() + MOONCAKE = auto() def get_multi_gpu_env(kv_cache_type=KVCacheType.NONE, llama_multi_gpu=False): @@ -37,6 +38,9 @@ def get_multi_gpu_env(kv_cache_type=KVCacheType.NONE, llama_multi_gpu=False): env["TRTLLM_USE_UCX_KVCACHE"] = "1" case KVCacheType.NIXL: env["TRTLLM_USE_NIXL_KVCACHE"] = "1" + case KVCacheType.MOONCAKE: + env["TRTLLM_USE_MOONCAKE_KVCACHE"] = "1" + env["MC_FORCE_TCP"] = "1" case KVCacheType.NONE: pass case _: @@ -502,8 +506,9 @@ def test_fused_gemm_allreduce(build_google_tests, nprocs, build_dir): @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"], indirect=True) -@pytest.mark.parametrize("kvcache_type", [KVCacheType.NIXL, KVCacheType.UCX], - ids=["nixl_kvcache", "ucx_kvcache"]) +@pytest.mark.parametrize( + "kvcache_type", [KVCacheType.NIXL, KVCacheType.UCX, KVCacheType.MOONCAKE], + ids=["nixl_kvcache", "ucx_kvcache", "mooncake_kvcache"]) @pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"]) def test_cache_transceiver(build_google_tests, nprocs, kvcache_type, build_dir): diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 63817ed9af..4e90db0050 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -231,6 +231,7 @@ l0_dgx_h100: - cpp/test_multi_gpu.py::test_cache_transceiver[2proc-ucx_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90] ISOLATION + - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_user_buffer[2proc-90] - cpp/test_multi_gpu.py::test_enc_dec[t5-90] - cpp/test_multi_gpu.py::test_llama_executor[llama-orchestrator-90] From a7ac5a6bca6eab92723ec2d4abacee940e56ad22 Mon Sep 17 00:00:00 2001 From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> Date: Fri, 19 Dec 2025 02:14:34 +0000 Subject: [PATCH 02/25] [None][infra] Check in most recent lock file from nightly pipeline Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> --- security_scanning/docs/poetry.lock | 6 +- .../examples/models/core/qwen/poetry.lock | 40 +- security_scanning/metadata.json | 4 +- security_scanning/poetry.lock | 481 +++++++++++++++++- security_scanning/pyproject.toml | 3 + security_scanning/triton_backend/poetry.lock | 8 +- 6 files changed, 508 insertions(+), 34 deletions(-) diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock index ac1ce39f45..f2f8e40c40 100644 --- a/security_scanning/docs/poetry.lock +++ b/security_scanning/docs/poetry.lock @@ -900,13 +900,13 @@ files = [ [[package]] name = "soupsieve" -version = "2.8" +version = 
"2.8.1" description = "A modern CSS selector implementation for Beautiful Soup." optional = false python-versions = ">=3.9" files = [ - {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, - {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, + {file = "soupsieve-2.8.1-py3-none-any.whl", hash = "sha256:a11fe2a6f3d76ab3cf2de04eb339c1be5b506a8a47f2ceb6d139803177f85434"}, + {file = "soupsieve-2.8.1.tar.gz", hash = "sha256:4cf733bc50fa805f5df4b8ef4740fc0e0fa6218cf3006269afd3f9d6d80fd350"}, ] [[package]] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock b/security_scanning/examples/models/core/qwen/poetry.lock index 261179a625..a2004681e0 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -2927,30 +2927,30 @@ six = ">=1.14.0" [[package]] name = "ruff" -version = "0.14.9" +version = "0.14.10" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.14.9-py3-none-linux_armv6l.whl", hash = "sha256:f1ec5de1ce150ca6e43691f4a9ef5c04574ad9ca35c8b3b0e18877314aba7e75"}, - {file = "ruff-0.14.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ed9d7417a299fc6030b4f26333bf1117ed82a61ea91238558c0268c14e00d0c2"}, - {file = "ruff-0.14.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d5dc3473c3f0e4a1008d0ef1d75cee24a48e254c8bed3a7afdd2b4392657ed2c"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84bf7c698fc8f3cb8278830fb6b5a47f9bcc1ed8cb4f689b9dd02698fa840697"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aa733093d1f9d88a5d98988d8834ef5d6f9828d03743bf5e338bf980a19fce27"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a1cfb04eda979b20c8c19550c8b5f498df64ff8da151283311ce3199e8b3648"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1e5cb521e5ccf0008bd74d5595a4580313844a42b9103b7388eca5a12c970743"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd429a8926be6bba4befa8cdcf3f4dd2591c413ea5066b1e99155ed245ae42bb"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab208c1b7a492e37caeaf290b1378148f75e13c2225af5d44628b95fd7834273"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72034534e5b11e8a593f517b2f2f2b273eb68a30978c6a2d40473ad0aaa4cb4a"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:712ff04f44663f1b90a1195f51525836e3413c8a773574a7b7775554269c30ed"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a111fee1db6f1d5d5810245295527cda1d367c5aa8f42e0fca9a78ede9b4498b"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8769efc71558fecc25eb295ddec7d1030d41a51e9dcf127cbd63ec517f22d567"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:347e3bf16197e8a2de17940cd75fd6491e25c0aa7edf7d61aa03f146a1aa885a"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:7715d14e5bccf5b660f54516558aa94781d3eb0838f8e706fb60e3ff6eff03a8"}, - {file = "ruff-0.14.9-py3-none-win32.whl", hash = 
"sha256:df0937f30aaabe83da172adaf8937003ff28172f59ca9f17883b4213783df197"}, - {file = "ruff-0.14.9-py3-none-win_amd64.whl", hash = "sha256:c0b53a10e61df15a42ed711ec0bda0c582039cf6c754c49c020084c55b5b0bc2"}, - {file = "ruff-0.14.9-py3-none-win_arm64.whl", hash = "sha256:8e821c366517a074046d92f0e9213ed1c13dbc5b37a7fc20b07f79b64d62cc84"}, - {file = "ruff-0.14.9.tar.gz", hash = "sha256:35f85b25dd586381c0cc053f48826109384c81c00ad7ef1bd977bfcc28119d5b"}, + {file = "ruff-0.14.10-py3-none-linux_armv6l.whl", hash = "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49"}, + {file = "ruff-0.14.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f"}, + {file = "ruff-0.14.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830"}, + {file = "ruff-0.14.10-py3-none-win32.whl", hash = "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6"}, + {file = "ruff-0.14.10-py3-none-win_amd64.whl", hash = "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154"}, + {file = "ruff-0.14.10-py3-none-win_arm64.whl", hash = "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6"}, + {file = "ruff-0.14.10.tar.gz", hash = "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4"}, ] [[package]] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 2356583a71..7d346cc119 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "c1cfb61b1b0940e9212b68e7ee72d42c6126e242", - "timestamp": 
"2025-12-18T02:42:21Z" + "commit_hash": "2e88c86f1059f918390ff24f35c618e29e4e44c6", + "timestamp": "2025-12-19T01:45:06Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 286d967e6f..fce2b50e72 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -740,6 +740,82 @@ files = [ {file = "colored-2.3.1.tar.gz", hash = "sha256:fe6e888e12dc16643daa0b108f785df6d0b48420084b5d0a567de27bb09a14d8"}, ] +[[package]] +name = "contourpy" +version = "1.3.2" +description = "Python library for calculating contours of 2D quadrilateral grids" +optional = false +python-versions = ">=3.10" +files = [ + {file = "contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934"}, + {file = "contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631"}, + {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f"}, + {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2"}, + {file = "contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0"}, + {file = "contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a"}, + {file = "contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445"}, + {file = "contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7"}, + {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83"}, + {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd"}, + {file = "contourpy-1.3.2-cp311-cp311-win32.whl", hash = 
"sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f"}, + {file = "contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878"}, + {file = "contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2"}, + {file = "contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe"}, + {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441"}, + {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e"}, + {file = "contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912"}, + {file = "contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73"}, + {file = "contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb"}, + {file = "contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841"}, + {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422"}, + {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef"}, + {file = "contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f"}, + {file = "contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9"}, + {file = "contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f"}, + {file = "contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = 
"sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b"}, + {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52"}, + {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd"}, + {file = "contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1"}, + {file = "contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5"}, + {file = "contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54"}, +] + +[package.dependencies] +numpy = ">=1.23" + +[package.extras] +bokeh = ["bokeh", "selenium"] +docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] +mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "types-Pillow"] +test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] +test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] + [[package]] name = "cuda-bindings" version = "13.1.1" @@ -841,6 +917,21 @@ opencl = ["nvidia-cuda-opencl (==13.0.85.*)"] profiler = ["nvidia-cuda-profiler-api (==13.0.85.*)"] sanitizer = ["nvidia-cuda-sanitizer-api (==13.0.85.*)"] +[[package]] +name = "cycler" +version = "0.12.1" +description = "Composable style cycles" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, + {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, +] + +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", 
"pytest-cov", "pytest-xdist"] + [[package]] name = "datasets" version = "3.1.0" @@ -1084,6 +1175,78 @@ tabulate = "*" torch = "*" tqdm = "*" +[[package]] +name = "fonttools" +version = "4.61.1" +description = "Tools to manipulate font files" +optional = false +python-versions = ">=3.10" +files = [ + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881"}, + {file = "fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47"}, + {file = "fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56"}, + {file = "fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a"}, + {file = "fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d"}, + {file = 
"fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0"}, + {file = "fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261"}, + {file = "fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d"}, + {file = "fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c"}, + {file = "fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2"}, + {file = "fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c"}, + {file = "fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91"}, + {file = 
"fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118"}, + {file = "fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5"}, + {file = "fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b"}, + {file = "fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371"}, + {file = "fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69"}, +] + +[package.extras] +all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "pycairo", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0)", "xattr", "zopfli (>=0.1.4)"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["munkres", "pycairo", "scipy"] +lxml = ["lxml (>=4.0)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.45.0)"] +symfont = ["sympy"] +type1 = ["xattr"] +unicode = ["unicodedata2 (>=17.0.0)"] +woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] + [[package]] name = "frozenlist" version = "1.8.0" @@ -1707,6 +1870,116 @@ files = [ [package.dependencies] referencing = ">=0.31.0" +[[package]] +name = "kiwisolver" +version = "1.4.9" +description = "A fast implementation of the Cassowary constraint solver" +optional = false +python-versions = ">=3.10" +files = [ + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84fd60810829c27ae375114cd379da1fa65e6918e1da405f356a775d49a62bcf"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b78efa4c6e804ecdf727e580dbb9cba85624d2e1c6b5cb059c66290063bd99a9"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4efec7bcf21671db6a3294ff301d2fc861c31faa3c8740d1a94689234d1b415"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90f47e70293fc3688b71271100a1a5453aa9944a81d27ff779c108372cf5567b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fdca1def57a2e88ef339de1737a1449d6dbf5fab184c54a1fca01d541317154"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cf554f21be770f5111a1690d42313e140355e687e05cf82cb23d0a721a64a48"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = 
"sha256:fc1795ac5cd0510207482c3d1d3ed781143383b8cfd36f5c645f3897ce066220"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ccd09f20ccdbbd341b21a67ab50a119b64a403b09288c27481575105283c1586"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:540c7c72324d864406a009d72f5d6856f49693db95d1fbb46cf86febef873634"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_amd64.whl", hash = "sha256:ede8c6d533bc6601a47ad4046080d36b8fc99f81e6f1c17b0ac3c2dc91ac7611"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_arm64.whl", hash = "sha256:7b4da0d01ac866a57dd61ac258c5607b4cd677f63abaec7b148354d2b2cdd536"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752"}, + {file = 
"kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = "sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1"}, + {file = 
"kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = 
"sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4d1d9e582ad4d63062d34077a9a1e9f3c34088a2ec5135b1f7190c07cf366527"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:deed0c7258ceb4c44ad5ec7d9918f9f14fd05b2be86378d86cf50e63d1e7b771"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a590506f303f512dff6b7f75fd2fd18e16943efee932008fe7140e5fa91d80e"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e09c2279a4d01f099f52d5c4b3d9e208e91edcbd1a175c9662a8b16e000fece9"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1"}, + {file = "kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d"}, +] + [[package]] name = "lark" version = "1.3.1" @@ -2017,6 +2290,84 @@ files = [ {file = "markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698"}, ] +[[package]] +name = "matplotlib" +version = "3.10.8" +description = "Python plotting package" +optional = false +python-versions = ">=3.10" +files = [ + {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17"}, + {file = "matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933"}, + {file = "matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2"}, + {file = "matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f"}, + {file = "matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466"}, + {file = 
"matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce"}, + {file = "matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565"}, + {file = "matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008"}, + {file = "matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8"}, + {file = 
"matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1"}, + {file = "matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2"}, + {file = "matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3"}, +] + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +kiwisolver = ">=1.3.1" +numpy = ">=1.23" +packaging = ">=20.0" +pillow = ">=8" +pyparsing = ">=3" +python-dateutil = ">=2.7" + +[package.extras] +dev = ["meson-python (>=0.13.1,<0.17.0)", "pybind11 (>=2.13.2,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7)"] + [[package]] name = "mdurl" version = "0.1.2" @@ -2551,6 +2902,75 @@ files = [ {file = "ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978"}, ] +[[package]] +name = "numexpr" +version = "2.13.1" +description = "Fast numerical expression evaluator for NumPy" +optional = false +python-versions = ">=3.10" +files = [ + {file = "numexpr-2.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bdbc2b93ac59667f0ba725b24cd3b5559c300e91e179d09c74ebaf8c8961eef6"}, + {file = "numexpr-2.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ad6b5dfc191c766e3ec89d2e3f956f7ef3181a1f8bf2bb00ec48fb3bf97b44ac"}, + {file = "numexpr-2.13.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a12dbd4c07a8303c6f01cdade531d75c9b4f5b8f72cbe5821d8f9197ee6fba47"}, + {file = "numexpr-2.13.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2de5c8ca2f25690d48e475d53a3524876164227cf4044743818f5704c28a8639"}, + {file = 
"numexpr-2.13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:533ec2d77fc059e3868e9798ef2f13ab57161517cd2e0c521bb33d1dc99068ca"}, + {file = "numexpr-2.13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a75ddffc36f6b7a679fbc7df492685aed7e8888aec80ec2cd8e30f21fc019caa"}, + {file = "numexpr-2.13.1-cp310-cp310-win32.whl", hash = "sha256:790af35095626ad2d02201c56ac2d49ae45fc95a02af85f40808752ed32ee103"}, + {file = "numexpr-2.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:aadf3118b6ef87294277ffb77a9562970228341aaaa4b78de634a43ea8ea2c6e"}, + {file = "numexpr-2.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bdf62745e072c670151c0705bddfe3f33c341dacb7eb255ddb1e8d2a257bfef5"}, + {file = "numexpr-2.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:91cf0521d8fed3f804640c4a6d22b5d9813d7e64b32c38215de163c7f092f7cc"}, + {file = "numexpr-2.13.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58e2f111756fff63e27e495473d950e4c98bbebca55aa1572798b59110d6c84b"}, + {file = "numexpr-2.13.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a5a37b74561ed8dbd5f9be182d94419fa53f452e2d7d3e8d6dbef35a20f19f7"}, + {file = "numexpr-2.13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78cb76676e63f02dcf507e3c563888018a68b6a2e2cd444628e09df270dfd0b2"}, + {file = "numexpr-2.13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d29b3351de4c43b56d2ef7f138ab7a8988e797291bcbbd56d545e4e7902f254a"}, + {file = "numexpr-2.13.1-cp311-cp311-win32.whl", hash = "sha256:912488ddbd500937bb6f4dfc010bdb3bf757a76e0b93db2f2c56db49ef6b9351"}, + {file = "numexpr-2.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:66d0292f3b9dc5faadb4dd8a89d733321ff01c9699aee0c3cdbf513c9505e39c"}, + {file = "numexpr-2.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6aa48c2f2bfa142dfe260441486452be8f70b5551c17bc846fccf76123d4a226"}, + {file = "numexpr-2.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:67a3dd8b51e94251f535a9a404f1ac939a3ebeb9398caad20ae9d0de37c6d3b3"}, + {file = "numexpr-2.13.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca152998d44ea30b45ad6b8a050ac4a9408b61a17508df87ad0d919335d79b44"}, + {file = "numexpr-2.13.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b4280c8f7cc024846be8fdd6582572bb0b6bad98fb2a68a367ef5e6e2e130d5f"}, + {file = "numexpr-2.13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b86e1daa4e27d6bf6304008ed4630a055babf863db2ec8f282b4058bbfe466bd"}, + {file = "numexpr-2.13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:30d189fc52ee4a33b869a0592553cd2ed686c20cded21b2ddf347a4d143f1bea"}, + {file = "numexpr-2.13.1-cp312-cp312-win32.whl", hash = "sha256:e926b59d385de2396935b362143ac2c282176875cf8ee7baba0a150b58421b5c"}, + {file = "numexpr-2.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:8230a8f7cd4e6ba4022643c85e119aa4ca90412267ef20acdf1f54fb3136680d"}, + {file = "numexpr-2.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e4314ee477a2cfb9ecf4b15f2ef24bf7859f62b35de3caef297136ff25bb0b0"}, + {file = "numexpr-2.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d82d088f67647861b61a7b0e0148fd7487000a20909d65734821dd27e0839a68"}, + {file = "numexpr-2.13.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c615b13976e6332336a052d5b03be1fed231bc1afe07699f4c7cc116c7c3092c"}, + {file = "numexpr-2.13.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:4874124bccc3c2462558ad2a75029bcc2d1c63ee4914b263bb06339e757efb85"}, + {file = "numexpr-2.13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0fc7b5b0f8d7ba6c81e948b1d967a56097194c894e4f57852ed8639fc653def2"}, + {file = "numexpr-2.13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e22104ab53f0933b5b522829149990cb74e0a8ec4b69ff0e6545eb4641b3f013"}, + {file = "numexpr-2.13.1-cp313-cp313-win32.whl", hash = "sha256:824aea72663ec123e042341cea4a2a2b3c71f315e4bc58ee5035ffc7f945bd29"}, + {file = "numexpr-2.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:9c7b1c3e9f398a5b062d9740c48ca454238bf1be433f0f75fe68619527bb7f1a"}, + {file = "numexpr-2.13.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:366a7887c2bad86e6f64666e178886f606cf8e81a6871df450d19f0f83421501"}, + {file = "numexpr-2.13.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:33ff9f071d06aaa0276cb5e2369efd517fe155ea091e43790f1f8bfd85e64d29"}, + {file = "numexpr-2.13.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c29a204b1d35941c088ec39a79c2e83e382729e4066b4b1f882aa5f70bf929a8"}, + {file = "numexpr-2.13.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:40e02db74d66c5b0a81c925838f42ec2d58cc99b49cbaf682f06ac03d9ff4102"}, + {file = "numexpr-2.13.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:36bd9a2b9bda42506377c7510c61f76e08d50da77ffb86a7a15cc5d57c56bb0f"}, + {file = "numexpr-2.13.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b9203651668a3994cf3fe52e079ff6be1c74bf775622edbc226e94f3d8ec8ec4"}, + {file = "numexpr-2.13.1-cp313-cp313t-win32.whl", hash = "sha256:b73774176b15fe88242e7ed174b5be5f2e3e830d2cd663234b1495628a30854c"}, + {file = "numexpr-2.13.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9e6228db24b7faa96fbb2beee55f90fc8b0fe167cf288f8481c53ff5e95865a"}, + {file = "numexpr-2.13.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cbadcbd2cf0822d595ccf5345c69478e9fe42d556b9823e6b0636a3efdf990f0"}, + {file = "numexpr-2.13.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a189d514e8aa321ef1c650a2873000c08f843b3e3e66d69072005996ac25809c"}, + {file = "numexpr-2.13.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b6b01e9301bed8f89f6d561d79dcaa8731a75cc50efc072526cfbc07df74226c"}, + {file = "numexpr-2.13.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7749e8c0ff0bae41a534e56fab667e529f528645a0216bb64260773ae8cb697"}, + {file = "numexpr-2.13.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0b0f326542185c23fca53e10fee3c39bdadc8d69a03c613938afaf3eea31e77f"}, + {file = "numexpr-2.13.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:33cc6d662a606cc5184c7faef1d7b176474a8c46b8b0d2df9ff0fa67ed56425f"}, + {file = "numexpr-2.13.1-cp314-cp314-win32.whl", hash = "sha256:71f442fd01ebfa77fce1bac37f671aed3c0d47a55e460beac54b89e767fbc0fa"}, + {file = "numexpr-2.13.1-cp314-cp314-win_amd64.whl", hash = "sha256:208cd9422d87333e24deb2fe492941cd13b65dc8b9ce665de045a0be89e9a254"}, + {file = "numexpr-2.13.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:37d31824b9c021078046bb2aa36aa1da23edaa7a6a8636ee998bf89a2f104722"}, + {file = "numexpr-2.13.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:15cee07c74e4792993cd2ecd46c5683815e8758ac56e1d4d236d2c9eb9e8ae01"}, + {file = "numexpr-2.13.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:65cb46136f068ede2fc415c5f3d722f2c7dde3eda04ceafcfbcac03933f5d997"}, + {file = 
"numexpr-2.13.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:abc3c1601380c90659b9ac0241357c5788ab58de148f56c5f98adffe293c308c"}, + {file = "numexpr-2.13.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2836e900377ce27e99c043a35e008bc911c51781cea47623612a4e498dfa9592"}, + {file = "numexpr-2.13.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f4e4c5b38bb5695fff119672c3462d9a36875256947bafb2df4117b3271fd6a3"}, + {file = "numexpr-2.13.1-cp314-cp314t-win32.whl", hash = "sha256:156591eb23684542fd53ca1cbefff872c47c429a200655ef7e59dd8c03eeeaef"}, + {file = "numexpr-2.13.1-cp314-cp314t-win_amd64.whl", hash = "sha256:a2cc21b2d2e59db63006f190dbf20f5485dd846770870504ff2a72c8d0406e4e"}, + {file = "numexpr-2.13.1.tar.gz", hash = "sha256:ecb722249c2d6ed7fefe8504bb17e056481a5f31233c23a7ee02085c3d661fa1"}, +] + +[package.dependencies] +numpy = ">=1.23.0" + [[package]] name = "numpy" version = "1.26.4" @@ -4060,6 +4480,20 @@ nvidia-ml-py = ">=12.0.0" [package.extras] test = ["pytest (>=3.6)", "pytest-cov", "pytest-runner"] +[[package]] +name = "pyparsing" +version = "3.2.5" +description = "pyparsing - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e"}, + {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pyproject-hooks" version = "1.2.0" @@ -5210,19 +5644,56 @@ opt-einsum = ["opt-einsum (>=3.3)"] optree = ["optree (>=0.13.0)"] pyyaml = ["pyyaml"] +[[package]] +name = "torch-c-dlpack-ext" +version = "0.1.3" +description = "torch c dlpack ext" +optional = false +python-versions = ">=3.9" +files = [ + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:49f8a1eaea21443c338df7bcf93f9026274b910ab23850777a88db040608c0a1"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e2cb08aa7591a08b4992fc99b10e86b46a65d9a46c34d9697e8fab03bfcaf46"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9ae36f7d4ccd4a9806528fa8dc8f0e3cfc47530adff8c7b6a72762bc97643b0"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:f92a0582cfa28418924f94bd6b89f662555d73dcc7ca0de1cad78a4f04ebca26"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:770fd7618973f70bfea288d5c419bdf974fc578e84248341524bb1ed20b969fd"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:71de2233ff974f09379e84699af88e83aeb63dd885627123f745780ff592d15c"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:78b963243b5b0e7d463fab365f31ec1569223845942f6591ab2ac067ad0f0338"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:b0244f282e0e74f2cefa843caeb601f5acfd88342029b0ca901dd40ab883818b"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b7d64453fa62c75551f2413cde55748a3461af475da386b2e709239555e07c3"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:cd69fb034cd638eb0908767d74e5d0ea87df18d366b18d66c2c3472b29c80e5e"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8ebf732b5079912e0b85f32a75bae6932f021fbc13c2dff1c9f7cea437b71345"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:69685ac393f1f402c289ac04435120d518bde890388474fe2f8a58e7d290eb50"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5f87b18064c017edb240b1766e858d18fe9472c11180a2811216293376ba6ef0"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2afc7165195f4a256aab16147040d63a0cc55b7c530946d9726125268a54303a"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:96743df478df006b21ae18111f4a2528abcc46131389b8d99176c37c30559474"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-win_amd64.whl", hash = "sha256:74f491fe1ec64ff631a4844ef87339a1e825d375d87bad79ec8e9b922292a043"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:61d17b3be0c43c846e8ff4c54e5f05a35daeb8453fb14cec05742fcce41bada7"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa8bf3a52fc13306866282e204ee6979a0cabaf64c8ef8d6ee700d4c4b2519a1"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fa48bb2e613c3a1fec135edbde1c7923a20b7dc3a5a3f2d17be7e0a7d333b18"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-win_amd64.whl", hash = "sha256:d7344b830359c4ef3165c10a82de96daf711a38c21b18b82c30d9d8dcd3e4529"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:b81bfa08d3dc791f808610e1abf0603c745b8c82681009a089b3dae650b6ff61"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d079d66404ec3911c02d4fd4cd41f42ef56f1ebdd5ecd68bcc2f425cbd12d08e"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d03bf108eab58b2c6dbe7e94211f670422c961b1e1e32fbaec442d5359ac02bf"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:5ee661e6b910e67200ba7c911436a5af8be288f938883971a0cf5632645183c8"}, + {file = "torch_c_dlpack_ext-0.1.3.tar.gz", hash = "sha256:4b5da66432af7224dcf02aad4f13cc416eeef5331cd153588b7e081a193f4972"}, +] + +[package.dependencies] +torch = "*" + [[package]] name = "torchao" -version = "0.14.1" +version = "0.15.0" description = "Package for applying ao techniques to GPU models" optional = false python-versions = "*" files = [ - {file = "torchao-0.14.1-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50f68db5e41952e88daa383fc2f358541e617654f388f508d5c7580c3bee9447"}, - {file = "torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b"}, + {file = "torchao-0.15.0-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cbe813201314ba6329a650a76944502f3e8ec4b1b44523f3f48676810d8d1f6"}, + {file = "torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c"}, ] [package.extras] -dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", 
"parameterized", "pre-commit", "pycocotools", "pytest", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] +dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest (==8.4.2)", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] [[package]] name = "torchprofile" @@ -5856,4 +6327,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "955bcecb84ae2d8555ba7c10772099be9a6451a8a00f61d5aa3b86d2666a4ef6" +content-hash = "d44d9d44355bac8ca580030e7e4eeb0a7cfdff7cf25045ffd8f38d077b27306c" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index 4253fe2ac0..a9a9aa0e2a 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -62,6 +62,7 @@ llguidance = "0.7.29" jsonschema = "^4.25.1" backoff = "^2.2.1" nvtx = "^0.2.14" +matplotlib = "^3.10.8" meson = "^1.10.0" ninja = "^1.13.0" etcd3 = {git = "https://github.com/kragniz/python-etcd3.git", rev = "e58a899579ba416449c4e225b61f039457c8072a"} @@ -73,7 +74,9 @@ blobfile = "^3.1.0" openai-harmony = "0.0.4" nvidia-cutlass-dsl = "4.3.1" plotly = "^6.5.0" +numexpr = "<2.14.0" partial-json-parser = "^0.2.1.1.post7" +torch-c-dlpack-ext = "0.1.3" mistral-common = "1.8.6" torchao = ">=0.14.1" diff --git a/security_scanning/triton_backend/poetry.lock b/security_scanning/triton_backend/poetry.lock index 159351cf11..b530fa57c3 100644 --- a/security_scanning/triton_backend/poetry.lock +++ b/security_scanning/triton_backend/poetry.lock @@ -842,17 +842,17 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "pytest-asyncio", "r [[package]] name = "torchao" -version = "0.14.1" +version = "0.15.0" description = "Package for applying ao techniques to GPU models" optional = false python-versions = "*" files = [ - {file = "torchao-0.14.1-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50f68db5e41952e88daa383fc2f358541e617654f388f508d5c7580c3bee9447"}, - {file = "torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b"}, + {file = "torchao-0.15.0-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cbe813201314ba6329a650a76944502f3e8ec4b1b44523f3f48676810d8d1f6"}, + {file = "torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c"}, ] [package.extras] -dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] +dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest (==8.4.2)", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] [[package]] name = "tqdm" From 9aa40871c2112fffdcf7b0a553c3786c5bd2c9ee Mon Sep 17 00:00:00 2001 From: Ivy Zhang 
 <25222398+crazydemo@users.noreply.github.com>
Date: Fri, 19 Dec 2025 10:54:15 +0800
Subject: [PATCH 03/25] [TRTLLM-9840][test] switch ucx backend to default backend (#10101)

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py
index 0c52852b9e..ff9dd92e0c 100644
--- a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py
+++ b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py
@@ -104,7 +104,7 @@ def gen_tp_pp_size(request):
 def worker(model_name: str, ctx_tp_pp_size: tuple, gen_tp_pp_size: tuple):
     extra_config = {
         "cache_transceiver_config": {
-            "backend": "UCX"
+            "backend": "DEFAULT"
         },
         "kv_cache_config": {
             "free_gpu_memory_fraction": 0.5,

From 00f70c30a66437606a1a8459ca668b245692b5e2 Mon Sep 17 00:00:00 2001
From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com>
Date: Fri, 19 Dec 2025 03:11:23 +0000
Subject: [PATCH 04/25] [None][infra] Check in most recent lock file from nightly pipeline

Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com>
---
 security_scanning/metadata.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json
index 7d346cc119..084a98983f 100644
--- a/security_scanning/metadata.json
+++ b/security_scanning/metadata.json
@@ -1,4 +1,4 @@
 {
-    "commit_hash": "2e88c86f1059f918390ff24f35c618e29e4e44c6",
-    "timestamp": "2025-12-19T01:45:06Z"
+    "commit_hash": "a7ac5a6bca6eab92723ec2d4abacee940e56ad22",
+    "timestamp": "2025-12-19T02:39:13Z"
 }

From 72c5480dfb7ed36c406c0aa34ba45e7e73e07e85 Mon Sep 17 00:00:00 2001
From: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
Date: Thu, 18 Dec 2025 19:12:05 -0800
Subject: [PATCH 05/25] [None][chore] Waive test blocking pre-merge 12/18 (#10145)

Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 8c529626ec..3f80444c1f 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -479,6 +479,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_ep
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5702793)
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5702793)
 disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKIP (https://nvbugs/5748564)
+disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] SKIP (https://nvbugs/5755963)
 unittest/llmapi/apps/test_disagg_serving_perf_metrics.py SKIP (https://nvbugs/5752516)
 unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5752521)
 unittest/llmapi/apps/_test_openai_responses.py::test_reasoning_effort[DeepSeek-R1-Distill-Qwen-1.5B] SKIP (https://nvbugs/5753250)

From 478b6b20a1dae771b29c93c1d55bf5be6e5ada36 Mon Sep 17 00:00:00 2001
From: William Zhang <133824995+2ez4bz@users.noreply.github.com>
Date: Thu, 18 Dec 2025 19:36:27 -0800
Subject: [PATCH 06/25] [#9230][refactor] Replace nemotron patches with custom model implementation (#9751) [#9230][refactor] Replace nemotron patches with custom model implementation * Why? Patching for nemotron H models was growing out of hand, and made certain optimizations more complex than they needed to be. * What? This commit finally gets rid of them, and replaces them with the custom model implementation in `modeling_nemotron_h.py`. Closes #9230 Closes NvBug 5747867 Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com> --- .../_torch/auto_deploy/custom_ops/rms_norm.py | 34 +++ .../_torch/auto_deploy/models/__init__.py | 2 - .../auto_deploy/models/custom/__init__.py | 7 + .../{ => custom}/modeling_nemotron_h.py | 149 ++++++----- .../auto_deploy/models/patches/nemotron_h.py | 200 --------------- .../transform/library/quantization.py | 2 +- .../auto_deploy/transform/library/rms_norm.py | 4 +- tests/integration/test_lists/waives.txt | 4 - .../custom_ops/test_mamba_rms_norm.py | 10 +- .../unit/singlegpu/models/test_hf.py | 4 +- .../singlegpu/models/test_hybrid_patches.py | 35 +-- .../models/test_modeling_nemotron_h.py | 235 ++++++++++++++++++ .../models/test_nemotron_h_patches.py | 158 ------------ 13 files changed, 379 insertions(+), 465 deletions(-) rename tensorrt_llm/_torch/auto_deploy/models/{ => custom}/modeling_nemotron_h.py (83%) create mode 100644 tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py delete mode 100644 tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py index 4265217453..7ce9b7befa 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py @@ -2,6 +2,8 @@ import flashinfer import torch +import torch.nn.functional as F +from einops import rearrange from ...flashinfer_utils import get_env_enable_pdl from ...modules.mamba.layernorm_gated import _layer_norm_fwd @@ -159,3 +161,35 @@ def _triton_rmsnorm_gated_meta( assert gate.shape == x.shape, "gate must match x shape" return x.new_empty(x.shape, dtype=torch.float32) + + +# Forked from: +# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py +# NOTES: +# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` +# to be installed so as to be able to make use of its grouped gated RMS norm operation. +# We therefore replace it with one that uses einops + pytorch. +def gated_rms_norm_ref( + x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True +): + dtype = x.dtype + # N = x.shape[-1] + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + z = z.float() if z is not None else z + if z is not None and not norm_before_gate: + x = x * F.silu(z) + if group_size is None: + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) + else: + x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) + rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) + out = rearrange(x_group * rstd, "... g d -> ... 
(g d)") * weight + if bias is not None: + out = out + bias + if z is not None and norm_before_gate: + out *= F.silu(z) + return out.to(dtype) diff --git a/tensorrt_llm/_torch/auto_deploy/models/__init__.py b/tensorrt_llm/_torch/auto_deploy/models/__init__.py index 6eae19f23c..327d084bf0 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/models/__init__.py @@ -1,4 +1,2 @@ -# TODO: When getting rid of the nemotron H patches, import `modeling_nemotron_h` here to ensure the -# custom model implementation is registered. from . import custom, hf, nemotron_flash, patches from .factory import * diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py index fef9fdb166..e32f72f56f 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py @@ -1 +1,8 @@ from .modeling_nemotron_flash import NemotronFlashForCausalLM, NemotronFlashPreTrainedTokenizerFast +from .modeling_nemotron_h import NemotronHForCausalLM + +__all__ = ( + "NemotronFlashForCausalLM", + "NemotronFlashPreTrainedTokenizerFast", + "NemotronHForCausalLM", +) diff --git a/tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py similarity index 83% rename from tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py rename to tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py index 6a54617497..3756c054f7 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py @@ -25,17 +25,14 @@ from typing import Optional, Tuple, Union import torch import torch.nn.functional as F import torch.utils.checkpoint -from einops import rearrange from torch import nn from transformers.activations import ACT2FN from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel from transformers.utils import ModelOutput -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import ( - _nemotron_h_moe_forward, - _nemotron_h_topk_router_forward, -) +from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import gated_rms_norm_ref +from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory class MambaRMSNormGated(torch.nn.Module): @@ -46,7 +43,7 @@ class MambaRMSNormGated(torch.nn.Module): self.group_size = group_size def forward(self, hidden_states, gate=None): - return _rms_norm_ref( + return gated_rms_norm_ref( x=hidden_states, weight=self.weight, bias=None, @@ -57,38 +54,6 @@ class MambaRMSNormGated(torch.nn.Module): ) -# Forked from: -# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py -# NOTES: -# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` -# to be installed so as to be able to make use of its grouped gated RMS norm operation. -# We therefore replace it with one that uses einops + pytorch. 
-def _rms_norm_ref( - x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True -): - dtype = x.dtype - # N = x.shape[-1] - weight = weight.float() - bias = bias.float() if bias is not None else None - if upcast: - x = x.float() - z = z.float() if z is not None else z - if z is not None and not norm_before_gate: - x = x * F.silu(z) - if group_size is None: - rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) - out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) - else: - x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) - rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) - out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight - if bias is not None: - out = out + bias - if z is not None and norm_before_gate: - out *= F.silu(z) - return out.to(dtype) - - class NemotronHMamba2Mixer(nn.Module): """ Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. @@ -149,9 +114,9 @@ class NemotronHMamba2Mixer(nn.Module): self.A_log._no_weight_decay = True # Instead of recomputing `torch.exp(self.A_log.float())` on every forward pass, we will register a hook # that sets this appropriately when loading weights. - # NOTE: we explicitly do NOT make this a `nn.Parameter` so that it does not appear in the state dict of - # this module, or an equivalent graph module trace from it. - self._minus_A = -A.float() + # NOTE: we explicitly register this as a non-persistent buffer so that it does not appear in the state dict of + # this module, or an equivalent graph module trace from it, but still gets included in e.g. `to()` calls. + self.register_buffer("_minus_A", -A.float(), persistent=False) self.norm = MambaRMSNormGated( self.intermediate_size, eps=self.layer_norm_epsilon, @@ -317,8 +282,43 @@ class NemotronHMOE(nn.Module): layer_idx=layer_idx, ) - # TODO: inline code from `_nemotron_h_moe_forward` when removing patches. - forward = _nemotron_h_moe_forward + def forward(self, hidden_states: torch.Tensor): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + x_flat = hidden_states.view(-1, hidden_states.shape[-1]) + + # NOTE: So far we've seen that the dispatch order in eager code is the same as the node order in the exported + # graph. + # We dispatch shared expert first so that we can easily fork the execution of the routed experts + # (using the custom op below) to an auxiliary stream. 
+ shared_out = self.shared_experts(residuals) + # Check if this is a latent MOE (has fc1_latent_proj and fc2_latent_proj) + has_latent_proj = hasattr(self, "fc1_latent_proj") and hasattr(self, "fc2_latent_proj") + + if has_latent_proj: + # Latent MOE: project to latent space before routing + x_flat = self.fc1_latent_proj(x_flat) + + # Route through experts (operates in latent space if latent MOE, full space otherwise) + out_flat = torch.ops.auto_deploy.torch_moe( + x_flat, + topk_indices, + topk_weights, + w1_weight=[e.up_proj.weight for e in self.experts], + w2_weight=[e.down_proj.weight for e in self.experts], + w3_weight=[], + act_fn="relu2", + mlp_style="mlp", + ) + + if has_latent_proj: + # Latent MOE: project back from latent space + out_flat = self.fc2_latent_proj(out_flat) + + routed_out = out_flat.view(*orig_shape) + out = shared_out + routed_out + return out class NemotronHTopkRouter(nn.Module): @@ -339,22 +339,33 @@ class NemotronHTopkRouter(nn.Module): "e_score_correction_bias", torch.zeros(self.n_routed_experts, dtype=torch.float32) ) - forward = _nemotron_h_topk_router_forward + def forward(self, hidden_states): + """ + Forward pass for NemotronHTopkRouter using the optimized noaux_tc_op kernel. + This replaces the original forward method which used pure PyTorch operations + with optimized CUDA kernels: + """ + hidden_states = hidden_states.view(-1, self.config.hidden_size) + if self.weight.dtype == torch.float32: + router_logits = F.linear(hidden_states.type(torch.float32), self.weight) + else: + router_logits = torch.ops.trtllm.dsv3_router_gemm_op( + hidden_states, self.weight.t(), bias=None, out_dtype=torch.float32 + ) -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand( - batch, num_key_value_heads, n_rep, slen, head_dim - ) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + # Use the fused noaux_tc_op kernel which applies sigmoid internally + # and performs group-based top-k selection with normalization + topk_weights, topk_indices = torch.ops.trtllm.noaux_tc_op( + router_logits, + self.e_score_correction_bias, + self.n_group, + self.topk_group, + self.top_k, + self.routed_scaling_factor, + ) + + return topk_indices, topk_weights class NemotronHAttention(nn.Module): @@ -369,8 +380,23 @@ class NemotronHAttention(nn.Module): self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - if config.head_dim is not None: - self.head_dim = config.head_dim + + # At some point during NemotronH development, what used to be called `attention_head_dim` + # was renamed to `head_dim`. Since no configuration class's code (nor the modeling code, + # for that matter) was ever upstreamed into `transformers`, we have to resort to the below + # hack in order to support multiple iterations of NemotronH models. 
+ if hasattr(config, "head_dim"): + head_dim = config.head_dim + elif hasattr(config, "attention_head_dim"): + head_dim = config.attention_head_dim + else: + raise AttributeError( + "Expected either `head_dim` or `attention_head_dim` to be present in the config " + "class, found neither." + ) + + if head_dim is not None: + self.head_dim = head_dim else: self.head_dim = config.hidden_size // config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads @@ -594,7 +620,4 @@ class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin): return NemotronHCausalLMOutput(logits) -# TODO: uncomment after removing patches (and make sure it is imported in `__init__.py`). -# from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory -# -# AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM) +AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py index 095e47f299..e69de29bb2 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py @@ -1,200 +0,0 @@ -import contextlib -import importlib.util -import sys -import types -from typing import Callable, Dict, List, Optional, Tuple - -import torch -import torch.nn.functional as F -from einops import rearrange -from transformers import AutoModelForCausalLM - -from tensorrt_llm._torch.auto_deploy.models.patches.bamba import _bamba_mixer_torch_forward - - -# Forked from: -# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py -# NOTES: -# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` -# to be installed so as to be able to make use of its grouped gated RMS norm operation. -# We therefore replace it with one that uses einops + pytorch. -def _rms_norm_ref( - x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True -): - dtype = x.dtype - # N = x.shape[-1] - weight = weight.float() - bias = bias.float() if bias is not None else None - if upcast: - x = x.float() - z = z.float() if z is not None else z - if z is not None and not norm_before_gate: - x = x * F.silu(z) - if group_size is None: - rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) - out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) - else: - x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) - rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) - out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight - if bias is not None: - out = out + bias - if z is not None and norm_before_gate: - out *= F.silu(z) - return out.to(dtype) - - -# The original implementation looks at `cache_position[0]` to decide what to do which does not -# play well with export. Plus, we do not want it to be updated anyway. 
-def _nemotron_h_model_update_mamba_mask(self, attention_mask, cache_position): - return None - - -def _nemotron_h_model_update_causal_mask(self, attention_mask, input_tensor, cache_position): - # Force attention to use causal mode without explicit masks - return None - - -def _nemotron_h_block_forward( - self, - hidden_states, - cache_params=None, - cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, -): - device = hidden_states.device - with contextlib.ExitStack() as stack: - if device.type == "cuda": - stack.enter_context(torch.cuda.stream(torch.cuda.default_stream(device))) - # * Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs - residual = hidden_states - hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) - if self.residual_in_fp32: - residual = residual.to(torch.float32) - - if self.block_type == "mamba": - hidden_states = self.mixer( - hidden_states, cache_params=cache_params, cache_position=cache_position - ) - elif self.block_type == "attention": - hidden_states = self.mixer(hidden_states, cache_position=cache_position) - hidden_states = hidden_states[0] - elif self.block_type in ["mlp", "moe"]: - hidden_states = self.mixer(hidden_states) - else: - raise ValueError(f"Invalid block_type: {self.block_type}") - - hidden_states = residual + hidden_states - return hidden_states - - -def _nemotron_h_topk_router_forward(self, hidden_states): - """ - Forward pass for NemotronHTopkRouter using the optimized noaux_tc_op kernel. - - This replaces the original forward method which used pure PyTorch operations - with optimized CUDA kernels: - """ - hidden_states = hidden_states.view(-1, self.config.hidden_size) - if self.weight.dtype == torch.float32: - router_logits = F.linear(hidden_states.type(torch.float32), self.weight) - else: - router_logits = torch.ops.trtllm.dsv3_router_gemm_op( - hidden_states, self.weight.t(), bias=None, out_dtype=torch.float32 - ) - - # Use the fused noaux_tc_op kernel which applies sigmoid internally - # and performs group-based top-k selection with normalization - topk_weights, topk_indices = torch.ops.trtllm.noaux_tc_op( - router_logits, - self.e_score_correction_bias, - self.n_group, - self.topk_group, - self.top_k, - self.routed_scaling_factor, - ) - - return topk_indices, topk_weights - - -# Note: we assume experts have no bias for now -def _nemotron_h_moe_forward(self, hidden_states: torch.Tensor): - """ - Uses NemotronH router (returns indices, weights) and dispatches through auto_deploy::torch_moe - with act_fn='relu2'. Handles both latent MOE and direct MOE architectures. - """ - - residuals = hidden_states - orig_shape = hidden_states.shape - topk_indices, topk_weights = self.gate(hidden_states) - x_flat = hidden_states.view(-1, hidden_states.shape[-1]) - - # NOTE: So far we've seen that the dispatch order in eager code is the same as the node order in the exported graph. - # We dispatch shared expert first so that we can easily fork the execution of the routed experts - # (using the custom op below) to an auxiliary stream. 
- shared_out = self.shared_experts(residuals) - # Check if this is a latent MOE (has fc1_latent_proj and fc2_latent_proj) - has_latent_proj = hasattr(self, "fc1_latent_proj") and hasattr(self, "fc2_latent_proj") - - if has_latent_proj: - # Latent MOE: project to latent space before routing - x_flat = self.fc1_latent_proj(x_flat) - - # Route through experts (operates in latent space if latent MOE, full space otherwise) - out_flat = torch.ops.auto_deploy.torch_moe( - x_flat, - topk_indices, - topk_weights, - w1_weight=[e.up_proj.weight for e in self.experts], - w2_weight=[e.down_proj.weight for e in self.experts], - w3_weight=[], - act_fn="relu2", - mlp_style="mlp", - ) - - if has_latent_proj: - # Latent MOE: project back from latent space - out_flat = self.fc2_latent_proj(out_flat) - - routed_out = out_flat.view(*orig_shape) - out = shared_out + routed_out - return out - - -_from_config_original = AutoModelForCausalLM.from_config - -CUSTOM_MODULE_PATCHES: Dict[str, List[Tuple[str, Callable]]] = { - "NemotronHMamba2Mixer": [("forward", _bamba_mixer_torch_forward)], - "NemotronHModel": [ - ("_update_causal_mask", _nemotron_h_model_update_causal_mask), - ("_update_mamba_mask", _nemotron_h_model_update_mamba_mask), - ], - "NemotronHBlock": [("forward", _nemotron_h_block_forward)], - "NemotronHMOE": [("forward", _nemotron_h_moe_forward)], - "NemotronHTopkRouter": [("forward", _nemotron_h_topk_router_forward)], -} - - -def get_model_from_config_patched(config, **kwargs): - model = _from_config_original(config, **kwargs) - # Patch modules - for _, module in model.named_modules(): - if (module_name := type(module).__name__) in CUSTOM_MODULE_PATCHES.keys(): - patches = CUSTOM_MODULE_PATCHES[module_name] - for method_name, method_patch in patches: - setattr(module, method_name, types.MethodType(method_patch, module)) - - return model - - -# TODO: figure out how this can be incorporated into the export patch system -AutoModelForCausalLM.from_config = get_model_from_config_patched - -# TODO: figure out how this can be incorporated into the export patch system -# Only patch if the module isn't available -_mamba_ssm_module = "mamba_ssm" -_mamba_ssm_submodule = f"{_mamba_ssm_module}.ops.triton.layernorm_gated" -if importlib.util.find_spec(_mamba_ssm_module) is None: - stub_mod = types.ModuleType(_mamba_ssm_submodule) - stub_mod.rmsnorm_fn = _rms_norm_ref - sys.modules[_mamba_ssm_submodule] = stub_mod diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py index 28c61e74dd..2fdaaf5506 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py @@ -123,7 +123,7 @@ class Quantization(BaseTransform): cnt += 1 return gm, TransformInfo( - skipped=False, num_matches=cnt, is_clean=False, has_valid_shapes=True + skipped=False, num_matches=cnt, is_clean=False, has_valid_shapes=(cnt == 0) ) def _insert_quantized_linear( diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py index 36c2e683bf..860b5b7de5 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py @@ -7,8 +7,8 @@ import torch from pydantic import Field from torch.fx import GraphModule +from ...custom_ops.rms_norm import gated_rms_norm_ref from ...models.factory import ModelFactory -from 
...models.patches.nemotron_h import _rms_norm_ref from ...shim.interface import CachedSequenceInterface # It is important to import ADPatternMatcherPass from pattern_matcher.py, not from torch._inductor.pattern_matcher @@ -225,7 +225,7 @@ def _gated_rmsnorm_pattern_ref( eps: float = 1e-5, group_size: int = 512, ) -> torch.Tensor: - y = _rms_norm_ref( + y = gated_rms_norm_ref( x, weight, bias=None, diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 3f80444c1f..4f6d8e75a4 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -460,10 +460,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5747920) examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime] SKIP (https://nvbugs/5747930) test_e2e.py::test_trtllm_serve_example SKIP (https://nvbugs/5747938) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_patch_forward[dtype0-2-6-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_patch_forward[dtype0-1-8-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_custom_implementation[dtype0-2-6-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_custom_implementation[dtype0-1-8-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py::test_build_ad[meta-llama/Llama-4-Scout-17B-16E-Instruct-llm_extra_args8] SKIP (https://nvbugs/5747878) unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py::test_build_ad[meta-llama/Llama-4-Scout-17B-16E-Instruct-llm_extra_args9] SKIP (https://nvbugs/5747878) triton_server/test_triton.py::test_opt[opt] SKIP (https://nvbugs/5739981) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py index 35b293686d..59952a6c89 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py @@ -1,8 +1,10 @@ import pytest import torch -import tensorrt_llm._torch.auto_deploy.custom_ops # noqa: F401 -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import _rms_norm_ref +from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import ( + gated_rms_norm_ref, + triton_rmsnorm_gated, +) @pytest.mark.skipif( @@ -19,12 +21,12 @@ def test_custom_op_matches_ref(B, T, H, group, use_gate, dtype): z = torch.randn_like(x) if use_gate else None w = torch.ones(H, dtype=dtype, device=device) - y_ref = _rms_norm_ref( + y_ref = gated_rms_norm_ref( x, w, bias=None, z=z, eps=1e-5, group_size=group, norm_before_gate=False, upcast=True ) # Custom op (currently returns fp32). Cast it back to x.dtype for apples-to-apples with ref. 
- y_op_fp32 = torch.ops.auto_deploy.triton_rmsnorm_gated(x, w, z, 1e-5, group, False) + y_op_fp32 = triton_rmsnorm_gated(x, w, z, 1e-5, group, False) y_op = y_op_fp32.to(x.dtype) assert y_ref.dtype == x.dtype and y_op.dtype == x.dtype diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py index 82f3774511..d6e63b6433 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py @@ -205,10 +205,12 @@ def test_custom_model_mapping_in_parent_does_not_affect_parent(): class Child(AutoModelForCausalLMFactory): pass + parent_mapping = copy.copy(AutoModelForCausalLMFactory._custom_model_mapping) + custom_model_cls = MagicMock(spec=AutoModelForCausalLM) custom_model_cls.configure_mock(_from_config=MagicMock(side_effect=MyError)) Child.register_custom_model_cls( config_cls_name=FooConfig.__name__, custom_model_cls=custom_model_cls ) - assert AutoModelForCausalLMFactory._custom_model_mapping == {} + assert AutoModelForCausalLMFactory._custom_model_mapping == parent_mapping diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py index ceabe6c1b9..6ea5c0efa1 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py @@ -1,5 +1,3 @@ -import copy - import pytest import torch from _model_test_utils import get_small_model_config @@ -7,8 +5,6 @@ from torch.export import Dim from tensorrt_llm._torch.auto_deploy.export import apply_export_patches, torch_export_to_gm from tensorrt_llm._torch.auto_deploy.llm_args import AutoDeployConfig -from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory -from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM from tensorrt_llm._torch.auto_deploy.utils._graph import move_to_device # NOTE: find example inputs with the same tokenization length to avoid seq concat. @@ -16,37 +12,15 @@ EXAMPLE_INPUT = "Mamba is a snake with the following properties:" EXAMPLE_INPUT2 = "Tiger is a cat with the following properties:" -@pytest.fixture -def setup_custom_model_cls_registry(request): - # TODO: remove all this when the patches in `bamba.py` and `nemotron_h.py` can be removed. - old_mapping = copy.copy(AutoModelForCausalLMFactory._custom_model_mapping) - AutoModelForCausalLMFactory._custom_model_mapping = {} - - register_custom_model = request.node.callspec.params.get("register_custom_model", False) - if register_custom_model: - AutoModelForCausalLMFactory.register_custom_model_cls( - config_cls_name="NemotronHConfig", - custom_model_cls=NemotronHForCausalLM, - ) - yield - AutoModelForCausalLMFactory._custom_model_mapping = old_mapping - - @pytest.mark.parametrize( - "model_dir,run_verify_generation,register_custom_model", + "model_dir,run_verify_generation", [ - ("ibm-ai-platform/Bamba-9B-v2", True, False), - # This tests the incumbent patching approach. - ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True, False), - # This tests the new custom model implementation. 
- ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True, True), + ("ibm-ai-platform/Bamba-9B-v2", True), ], ) def test_bamba_patches( model_dir: str, run_verify_generation: bool, - register_custom_model: bool, - setup_custom_model_cls_registry, ): # NOTE: set to False if you want to locally test the full model. use_small_config: bool = True @@ -124,13 +98,14 @@ def test_bamba_patches( move_to_device(gm, "cuda") factory._to_maybe_random(model, "cuda") model.load_state_dict(gm.state_dict()) + gm.load_state_dict(model.state_dict()) else: factory.load_or_random_init(model, device="cuda") gm = _run_torch_export_to_gm() move_to_device(gm, "cuda") if run_verify_generation: - _verify_generation(factory, model, tokenizer) + _verify_generation(model, tokenizer) # let's do a comparison of every state dict item between the model and the gm torch.testing.assert_close(model.state_dict(), gm.state_dict(), rtol=0.0, atol=0.0) @@ -157,7 +132,7 @@ def test_bamba_patches( ) -def _verify_generation(factory, model, tokenizer): +def _verify_generation(model, tokenizer): print("====== WITHOUT PATCH ======") _generate(tokenizer, model) with apply_export_patches(patch_list=["bamba"]): diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py new file mode 100644 index 0000000000..94b22ed14f --- /dev/null +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py @@ -0,0 +1,235 @@ +import importlib.util +import sys +import types +from unittest import mock + +import pytest +import torch +from _model_test_utils import get_small_model_config +from torch.export import Dim +from transformers import AutoConfig, AutoModelForCausalLM +from utils.llm_data import llm_models_root + +from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.llm_args import AutoDeployConfig +from tensorrt_llm._torch.auto_deploy.models.custom.modeling_nemotron_h import NemotronHForCausalLM +from tensorrt_llm._torch.auto_deploy.utils._graph import move_to_device + +_BATCH_AND_SEQUENCE_TEST_CASES = ((2, 6), (1, 8)) + + +@pytest.fixture(scope="function", autouse=True) +def set_seed(): + torch.manual_seed(42) + + +@pytest.fixture(autouse=True) +def stub_mamba_ssm_if_missing(): + """Stub `mamba_ssm` package. + + The `modeling_nemotron_h.py` code in all recent nemotron checkpoints have a hard dependency + on `mamba_ssm.ops.triton.layernorm_gated.rmsnorm_fn`. This fixture stubs it, such that we + at least can get past the import stage of the remote modeling code. + """ + module = "mamba_ssm" + submodule = f"{module}.ops.triton.layernorm_gated" + + if importlib.util.find_spec(module) is not None: + yield + return + + stub_mod = types.ModuleType(submodule) + stub_mod.rmsnorm_fn = None + + with mock.patch.dict(sys.modules, {submodule: stub_mod}): + yield + + +def _load_nemotron_moe_layer(model_name_or_path: str, custom_model_cls=None): + """ + Build a tiny NemotronH model (1 layer, small dims) and return the first NemotronHMOE module. 
+ """ + cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) + + cfg.use_cache = False + + cfg.torch_dtype = "bfloat16" + cfg.hidden_size = 32 + cfg.intermediate_size = 64 + cfg.moe_intermediate_size = 64 + cfg.moe_shared_expert_intermediate_size = 64 + cfg.mamba_head_dim = 40 + cfg.mamba_num_heads = 4 + cfg.n_groups = 2 + cfg.num_attention_heads = 4 + cfg.num_hidden_layers = 9 + cfg.num_key_value_heads = 2 + cfg.ssm_state_size = 32 + + if custom_model_cls is None: + model = AutoModelForCausalLM.from_config(cfg, trust_remote_code=True) + else: + model = custom_model_cls._from_config(cfg) + model.eval() + + nemotron_moe = None + for _, mod in model.named_modules(): + if type(mod).__name__ == "NemotronHMOE": + nemotron_moe = mod + break + + if nemotron_moe is None: + raise RuntimeError("NemotronHMOE layer not found. Check your model id or config.") + + _set_gate_weights(nemotron_moe) + + return nemotron_moe + + +def _set_gate_weights(module): + # This helper function is necessary because the `weight` parameter of the `NemotronHTopkRouter` + # is initialized as `torch.empty` in the original model code, which no manner of random seed + # setting will have any effect on. We therefore set it like the below to ensure the + # reproducibility of the tests. + for _, mod in module.named_modules(): + if type(mod).__name__ == "NemotronHTopkRouter": + if hasattr(mod, "weight"): + mod.weight = torch.nn.Parameter(torch.randn_like(mod.weight)) + + +@pytest.mark.parametrize( + "model_name", + [ + llm_models_root() / "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + ], +) +@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@torch.no_grad() +def test_nemotronh_moe_custom_implementation(model_name, B, S, dtype): + device = "cuda" + + module = _load_nemotron_moe_layer(model_name) + module.to(device) + + H = module.config.hidden_size + x = torch.randn(B, S, H, device=device, dtype=dtype) + + ref = module(x) + + new_module = _load_nemotron_moe_layer(model_name, custom_model_cls=NemotronHForCausalLM) + new_module.to(device) + new_module.load_state_dict(module.state_dict()) + + test = new_module(x) + + rtol = 0.05 + atol = 0.05 + + torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize( + "model_dir,model_on_meta_during_export", + [ + ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True), + ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", False), + ], +) +def test_custom_model_implementation_can_be_exported( + model_dir: str, + model_on_meta_during_export: bool, +): + # NOTE: set to False if you want to locally test the full model. + use_small_config: bool = True + + common_kwargs = { + "world_size": 0, + "runtime": "demollm", + "model_factory": "AutoModelForCausalLM", + "max_seq_len": 512, + "transforms": { + "insert_cached_attention": {"backend": "flashinfer"}, + "compile_model": {"backend": "torch-simple"}, + }, + } + + if use_small_config: + llm_args = get_small_model_config(model_dir, **common_kwargs)["args"] + else: + llm_args = { + "model": model_dir, + **common_kwargs, + "model_kwargs": { + "dtype": "bfloat16", + }, + } + llm_args = AutoDeployConfig(**llm_args) + + factory = llm_args.create_factory() + model = factory.build_model("meta") + tokenizer = factory.init_tokenizer() + + # 1. Export wants min batch size of 2 (to avoid specialization during export). + # 2. 
Can't get `padding` / `truncation` to work without other steps so just use the prompts + # with the same tokenized length in order for the tokenizer not to complain when creating + # the tensor. + message = [ + "Mamba is a snake with the following properties:", + "Tiger is a cat with the following properties:", + ] + inputs = tokenizer(message, return_tensors="pt", return_token_type_ids=False).to("cuda") + + input_ids = inputs["input_ids"] + position_ids = torch.arange(input_ids.shape[1], device=input_ids.device).repeat( + input_ids.shape[0], 1 + ) + dynamic_shapes = ( + {0: Dim("batch_size", min=0, max=8), 1: Dim("seq_len", min=0, max=512)}, + { + 0: Dim("batch_size", min=0, max=8), + 1: Dim("seq_len", min=0, max=512), + }, + ) + + def _run_torch_export_to_gm(): + return torch_export_to_gm( + model, + args=tuple(), + kwargs={"input_ids": input_ids, "position_ids": position_ids}, + dynamic_shapes=dynamic_shapes, + ) + + if model_on_meta_during_export: + gm = _run_torch_export_to_gm() + factory.load_or_random_init(gm, device="cuda") + move_to_device(gm, "cuda") + factory._to_maybe_random(model, "cuda") + # In order to ensure the `_minus_A` (non-persistent buffer) is correct, we need to run the + # model's load state pre/post hooks by loading the state dicts after initialization. + # NOTE: this is done under the hood by `torch_export_to_gm`, so we only need this in this + # `if` clause. + model.load_state_dict(gm.state_dict()) + gm.load_state_dict(model.state_dict()) + else: + factory.load_or_random_init(model, device="cuda") + gm = _run_torch_export_to_gm() + move_to_device(gm, "cuda") + + # let's do a comparison of every state dict item between the model and the gm + torch.testing.assert_close(model.state_dict(), gm.state_dict(), rtol=0.0, atol=0.0) + torch.testing.assert_close( + dict(model.named_buffers()), dict(gm.named_buffers()), rtol=0.0, atol=0.0 + ) + + with torch.inference_mode(): + out_original = model(input_ids=input_ids, position_ids=position_ids) + out_gm = gm(input_ids=input_ids, position_ids=position_ids) + + atol, rtol = 1e-3, 1e-3 + torch.testing.assert_close( + out_gm, + out_original, + rtol=rtol, + atol=atol, + ) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py deleted file mode 100644 index 3ef4e8eb54..0000000000 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py +++ /dev/null @@ -1,158 +0,0 @@ -import functools -import types - -import pytest -import torch -from _model_test_utils import _hf_model_dir_or_hub_id -from transformers import AutoConfig - -from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import ( - _from_config_original, - _nemotron_h_moe_forward, -) - -_BATCH_AND_SEQUENCE_TEST_CASES = ((2, 6), (1, 8)) - - -@pytest.fixture(scope="function", autouse=True) -def set_seed(): - torch.manual_seed(42) - - -def skip_on_no_hf_access(func): - """Decorator for skipping tests that fail due to HF access issues. - - This allows us to share the same test code for CI (where access may be restricted, especially for private - repositories) and locally. 
- """ - - @functools.wraps(func) - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except OSError as e: - if "not a valid model identifier" in str(e): - pytest.skip("Test skipped due to (no) HF access.") - raise - - return wrapper - - -def _load_nemotron_moe_layer(model_name_or_path: str, custom_model_cls=None): - """ - Build a tiny NemotronH model (1 layer, small dims) and return the first NemotronHMOE module. - """ - cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) - - cfg.use_cache = False - - cfg.torch_dtype = "bfloat16" - cfg.hidden_size = 32 - cfg.intermediate_size = 64 - cfg.moe_intermediate_size = 64 - cfg.moe_shared_expert_intermediate_size = 64 - cfg.mamba_head_dim = 40 - cfg.mamba_num_heads = 4 - cfg.n_groups = 2 - cfg.num_attention_heads = 4 - cfg.num_hidden_layers = 9 - cfg.num_key_value_heads = 2 - cfg.ssm_state_size = 32 - - if custom_model_cls is None: - model = _from_config_original(cfg, trust_remote_code=True) - else: - model = custom_model_cls._from_config(cfg) - model.eval() - - nemotron_moe = None - for _, mod in model.named_modules(): - if type(mod).__name__ == "NemotronHMOE": - nemotron_moe = mod - break - - if nemotron_moe is None: - raise RuntimeError("NemotronHMOE layer not found. Check your model id or config.") - - _set_gate_weights(nemotron_moe) - - return nemotron_moe - - -def _set_gate_weights(module): - # This helper function is necessary because the `weight` parameter of the `NemotronHTopkRouter` - # is initialized as `torch.empty` in the original model code, which no manner of random seed - # setting will have any effect on. We therefore set it like the below to ensure the - # reproducibility of the tests. - for _, mod in module.named_modules(): - if type(mod).__name__ == "NemotronHTopkRouter": - if hasattr(mod, "weight"): - mod.weight = torch.nn.Parameter(torch.randn_like(mod.weight)) - - -@pytest.mark.parametrize( - "model_name", - [ - _hf_model_dir_or_hub_id( - "NVIDIA-Nemotron-Nano-31B-A3-v3", "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3" - ), - ], -) -@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -@torch.no_grad() -@skip_on_no_hf_access -def test_nemotronh_moe_patch_forward(model_name, B, S, dtype): - device = "cuda" - - module = _load_nemotron_moe_layer(model_name) - module.to(device) - - H = module.config.hidden_size - x = torch.randn(B, S, H, device=device, dtype=dtype) - - ref = module(x) - - module.forward = types.MethodType(_nemotron_h_moe_forward, module) - test = module(x) - - rtol = 0.05 - atol = 0.05 - - torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) - - -@pytest.mark.parametrize( - "model_name", - [ - _hf_model_dir_or_hub_id( - "NVIDIA-Nemotron-Nano-31B-A3-v3", "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3" - ), - ], -) -@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -@torch.no_grad() -@skip_on_no_hf_access -def test_nemotronh_moe_custom_implementation(model_name, B, S, dtype): - device = "cuda" - - module = _load_nemotron_moe_layer(model_name) - module.to(device) - - H = module.config.hidden_size - x = torch.randn(B, S, H, device=device, dtype=dtype) - - ref = module(x) - - new_module = _load_nemotron_moe_layer(model_name, custom_model_cls=NemotronHForCausalLM) - new_module.to(device) - new_module.load_state_dict(module.state_dict()) - - test = new_module(x) - - rtol = 0.05 - atol = 0.05 - - torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) 
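The patching module retired above leans on two standard-library mechanisms: registering a stub in sys.modules so that `from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn` resolves even when the optional package is absent, and rebinding `forward` on individual module instances with types.MethodType. A minimal, self-contained sketch of both mechanisms follows; the helper names are illustrative and not part of the repository.

    import sys
    import types


    def install_stub_module(dotted_name: str, **attrs) -> types.ModuleType:
        """Register a stub so `from <dotted_name> import <attr>` resolves without the real package."""
        stub = types.ModuleType(dotted_name)
        for name, value in attrs.items():
            setattr(stub, name, value)
        sys.modules[dotted_name] = stub
        return stub


    def rebind_method(obj: object, method_name: str, fn) -> None:
        """Bind `fn` as a bound method on this instance only; other instances keep the class method."""
        setattr(obj, method_name, types.MethodType(fn, obj))


    # Illustrative usage, mirroring the stub fixture above: only the submodule that the
    # remote modeling code imports from needs to exist in sys.modules.
    install_stub_module("mamba_ssm.ops.triton.layernorm_gated", rmsnorm_fn=None)
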
From 48dbc61129ea706dbe96c8ac21be290547e464c9 Mon Sep 17 00:00:00 2001 From: Larry Xu <197874197+LarryXFly@users.noreply.github.com> Date: Fri, 19 Dec 2025 13:38:21 +0800 Subject: [PATCH 07/25] [None][chore] Update CODEOWNERS for test cases and test list (#10119) Signed-off-by: LarryXFly <197874197+LarryXFly@users.noreply.github.com> --- .github/CODEOWNERS | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 35181f4f3d..91062b138a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,18 @@ # This file defines code ownership rules for the repository. +## TensorRT-LLM QA +### Integration Tests +/tests/integration/test_lists/qa @NVIDIA/trt-llm-qa +/tests/integration/defs/examples/test_ray.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/examples/test_redrafter.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/accuracy @NVIDIA/trt-llm-qa-function +/tests/integration/defs/stress_test @NVIDIA/trt-llm-qa-function +/tests/integration/defs/triton_server @NVIDIA/trt-llm-qa-function +/tests/integration/defs/test_e2e.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/disaggregated @NVIDIA/trt-llm-qa-serving +/tests/integration/defs/sysinfo @NVIDIA/trt-llm-qa-perf +/tests/integration/defs/perf @NVIDIA/trt-llm-qa-perf +/tests/integration/defs/perf/disagg @NVIDIA/trt-llm-qa-serving ## TensorRT-LLM Infra ### CI From 70b4d282c6b3a1f77473e79dd3c429967d7ce69c Mon Sep 17 00:00:00 2001 From: Ziyi Xiong <219238287+ziyixiong-nv@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:11:25 +0800 Subject: [PATCH 08/25] [TRTLLM-7736][feat] Incrementally update the inputs of target and draft models (#9708) Signed-off-by: ziyixiong-nv <219238287+ziyixiong-nv@users.noreply.github.com> --- .../_torch/auto_deploy/shim/ad_executor.py | 1 + .../_torch/pyexecutor/model_engine.py | 524 +++++++++++++++++- tensorrt_llm/_torch/pyexecutor/py_executor.py | 35 +- .../_torch/speculative/model_drafter.py | 5 +- 4 files changed, 546 insertions(+), 19 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index 924bf10cff..80e8159a8a 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -695,6 +695,7 @@ class ADEngine(ModelEngine): new_tensors_device: Optional[torch.Tensor] = None, gather_context_logits: bool = False, cache_indirection_buffer: Optional[torch.Tensor] = None, + num_accepted_tokens_device: Optional[torch.Tensor] = None, ): """Run forward from scheduled requests; main entrypoint that gets called by the executor.""" # convert requests and store in sequence info object diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 7574b8f6fd..81c65288f8 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -305,6 +305,11 @@ class PyTorchModelEngine(ModelEngine): raise e self.is_warmup = False + self.previous_request_ids = [] + self.has_previous_device_draft = False + self.previous_accepted_tokens_cuda = torch.empty((self.batch_size, ), + dtype=torch.int, + device='cuda') self.attn_backend = get_attention_backend( self.llm_args.attn_backend, @@ -378,8 +383,18 @@ class PyTorchModelEngine(ModelEngine): # Pre-allocated buffers for draft model to avoid implicit synchronization # These are used to build index tensors without creating tensors from Python lists - if is_draft_model: - # 
Buffers for context and first_draft input_ids updates + max_first_draft_tokens = self.batch_size * ( + self.original_max_draft_len + 1) if spec_config else self.batch_size + tokens_per_draft = self.original_max_draft_len + 1 + self.idx_accepted_tokens_cache = None + self.draft_token_positions_cache = None + if spec_config: + # Cache for idx_accepted_tokens (pattern: 0,0,0...1,1,1...2,2,2...) + self.idx_accepted_tokens_cache = torch.arange( + max_first_draft_tokens, dtype=torch.long, + device='cuda') // tokens_per_draft + + if self.is_draft_model: self.draft_ctx_token_indices_cuda = torch.empty((self.batch_size, ), dtype=torch.long, device='cuda') @@ -387,9 +402,6 @@ class PyTorchModelEngine(ModelEngine): dtype=torch.long, device='cuda') # Buffers for first_draft requests (max_draft_len+1 tokens per request) - max_first_draft_tokens = self.batch_size * ( - self.original_max_draft_len + - 1) if spec_config else self.batch_size self.draft_first_draft_indices_cuda = torch.empty( (max_first_draft_tokens, ), dtype=torch.long, device='cuda') self.draft_first_draft_seq_slots_cuda = torch.empty( @@ -401,6 +413,12 @@ class PyTorchModelEngine(ModelEngine): self.draft_request_indices_buffer_cuda = torch.empty( (self.batch_size, ), dtype=torch.int, device='cuda') + # Pre-computed constant tensors for incremental update optimization + # Cache for token_positions (pattern: 0,1,2...N repeated) + self.draft_token_positions_cache = torch.arange(tokens_per_draft, + dtype=torch.long, + device='cuda') + # We look up this key in resource_manager during forward to find the # kv cache manager. Can be changed to support multiple model engines # with different KV cache managers. @@ -1328,6 +1346,482 @@ class PyTorchModelEngine(ModelEngine): input_ids, vocab_size=vocab_size, mm_token_ids=mm_token_ids) return text_token_indices, mm_token_indices + def _can_use_incremental_update( + self, scheduled_requests: ScheduledRequests, + new_tokens_device: Optional[torch.Tensor], + next_draft_tokens_device: Optional[torch.Tensor]) -> bool: + """ + Check if we can use incremental update for the given scheduled requests and new tensors device. + """ + # Not use this approach for non-speculative decoding + if self.spec_config is None: + return False + + # Not allowed for one-model speculative decoding + if not self.spec_config.spec_dec_mode.has_draft_model(): + return False + + if not self.cuda_graph_runner.enabled: + return False + + if self.use_mrope: + return False + + # Not allowed for non-overlap scheduler + if new_tokens_device is None: + return False + + # The changes between context and generation requests are not straightforward. 
+ if len(scheduled_requests.context_requests) > 0: + return False + + # Check if the request_ids changes + request_ids = [ + request.py_request_id + for request in scheduled_requests.generation_requests + ] + if self.previous_request_ids != request_ids: + return False + + has_current_device_draft = next_draft_tokens_device is not None + return (self.is_draft_model and self.model_is_wrapped) or ( + has_current_device_draft and self.has_previous_device_draft) + + @nvtx_range("_apply_incremental_update") + def _apply_incremental_update( + self, + scheduled_requests: ScheduledRequests, + kv_cache_manager: KVCacheManager, + attn_metadata: AttentionMetadata, + spec_metadata: Optional[SpecMetadata] = None, + new_tensors_device: Optional[SampleStateTensors] = None, + cache_indirection_buffer: Optional[torch.Tensor] = None, + num_accepted_tokens_device: Optional[torch.Tensor] = None, + req_id_to_old_request: Optional[Dict[int, LlmRequest]] = None, + resource_manager: Optional[ResourceManager] = None): + """ + Apply incremental update for the given scheduled requests and new tensors device. + """ + + if self.is_draft_model: + return self._apply_incremental_update_draft( + scheduled_requests, kv_cache_manager, attn_metadata, + spec_metadata, new_tensors_device, num_accepted_tokens_device) + else: + return self._apply_incremental_update_target( + scheduled_requests, kv_cache_manager, attn_metadata, + spec_metadata, new_tensors_device, num_accepted_tokens_device) + + @nvtx_range("_prepare_incremental_update_metadata") + def _prepare_incremental_update_metadata( + self, + scheduled_requests: ScheduledRequests, + kv_cache_manager: KVCacheManager, + attn_metadata: AttentionMetadata, + spec_metadata: Optional[SpecMetadata], + prompt_lengths: List[int], + num_cached_tokens_per_seq: List[int], + total_num_tokens: int, + num_generation_tokens: int, + request_accepted_path: Optional[Dict[int, Any]] = None, + num_extend_ctx_requests: int = 0): + """ + Common metadata preparation logic for incremental updates. 
+ """ + + enable_spec_decode = self.enable_spec_decode + enable_attention_dp = self.enable_attention_dp + spec_config = self.spec_config if enable_spec_decode else None + + # Set up attention metadata - batch simple assignments + attn_metadata.beam_width = 1 + attn_metadata.prompt_lens = prompt_lengths + attn_metadata.num_contexts = num_extend_ctx_requests if ( + enable_spec_decode and spec_config.spec_dec_mode.extend_ctx( + self.attn_backend) and spec_config.is_linear_tree) else 0 + attn_metadata.num_chunked_ctx_requests = attn_metadata.num_contexts + + # Create KV cache params and prepare metadata + attn_metadata.kv_cache_params = KVCacheParams( + use_cache=True, + num_cached_tokens_per_seq=num_cached_tokens_per_seq, + num_extra_kv_tokens=get_num_extra_kv_tokens(spec_config)) + attn_metadata.kv_cache_manager = kv_cache_manager + attn_metadata.prepare() + + # Get LoRA parameters + lora_params = self._get_lora_params_from_requests( + scheduled_requests, attn_metadata) + + # Handle padding for piecewise CUDA graphs + attn_metadata.padded_num_tokens = None + + # Handle attention DP + if enable_attention_dp: + attn_metadata.all_rank_num_tokens = self._get_all_rank_num_tokens( + attn_metadata) + + # Prepare speculative metadata + if spec_metadata is not None: + # Set request_accepted_path if Eagle3 + if isinstance(spec_metadata, Eagle3SpecMetadata): + spec_metadata.request_accepted_path = request_accepted_path + + spec_metadata.num_tokens = total_num_tokens + spec_metadata.prepare() + + # Handle distributed spec metadata + if enable_attention_dp: + sequence_lengths = spec_metadata.seq_lens + all_rank_num_tokens = self.dist.tp_allgather( + [spec_metadata.num_tokens, + len(sequence_lengths)]) + spec_metadata.all_rank_num_tokens = [ + item[0] for item in all_rank_num_tokens + ] + spec_metadata.all_rank_num_seqs = [ + item[1] for item in all_rank_num_tokens + ] + + # Set iteration states - batch dictionary updates + self.iter_states.update({ + 'num_ctx_requests': 0, + 'num_ctx_tokens': 0, + 'num_generation_tokens': num_generation_tokens + }) + + return lora_params + + @torch.compile(options={"max-autotune": True}) + def _update_draft_input_tensors(self, + num_accepted_tokens_device: torch.Tensor, + new_tokens_device: torch.Tensor, + total_num_tokens: int, + num_first_draft_requests: int): + """ + This function performs in-place updates on position_ids, num_accepted_draft_tokens, + gather_ids, and input_ids tensors for speculative decoding draft operations. + """ + # Prepare position_ids + idx_accepted_tokens = self.idx_accepted_tokens_cache[:total_num_tokens] + self.position_ids_cuda[:total_num_tokens].add_( + self.num_accepted_draft_tokens_cuda[idx_accepted_tokens] + 1) + + # Prepare gather_ids + old_accepted_tokens = self.num_accepted_draft_tokens_cuda[: + num_first_draft_requests].clone( + ) + self.num_accepted_draft_tokens_cuda[:num_first_draft_requests].copy_( + num_accepted_tokens_device[ + self.draft_seq_slots_buffer_cuda[:num_first_draft_requests]], + non_blocking=True) + self.gather_ids_cuda[:num_first_draft_requests].add_( + self.num_accepted_draft_tokens_cuda[:num_first_draft_requests] - + old_accepted_tokens) + + # Prepare token_positions for input_ids update + tokens_per_first_draft = self.original_max_draft_len + 1 + token_positions = self.draft_token_positions_cache[:tokens_per_first_draft].repeat( + num_first_draft_requests) + + # Prepare input_ids + self.input_ids_cuda[ + self. 
+ draft_first_draft_indices_cuda[:total_num_tokens]] = new_tokens_device[ + token_positions, + self.draft_first_draft_seq_slots_cuda[:total_num_tokens], 0] + + def _apply_incremental_update_draft( + self, + scheduled_requests: ScheduledRequests, + kv_cache_manager: KVCacheManager, + attn_metadata: AttentionMetadata, + spec_metadata: Optional[SpecMetadata] = None, + new_tensors_device: Optional[SampleStateTensors] = None, + num_accepted_tokens_device: Optional[torch.Tensor] = None): + new_tokens_device = new_tensors_device.new_tokens + + num_generation_tokens = len(scheduled_requests.generation_requests) + num_gen_requests = 0 + + tokens_per_first_draft = self.original_max_draft_len + 1 + prompt_lengths = [] # per sequence + num_cached_tokens_per_seq = [] # per sequence + + for request in scheduled_requests.generation_requests: + if request.is_dummy: + num_gen_requests += 1 + past_seen_token_num = request.max_beam_num_tokens - 1 + request.cached_tokens = past_seen_token_num + else: + assert request.py_is_first_draft + past_seen_token_num = request.max_beam_num_tokens - tokens_per_first_draft + + num_cached_tokens_per_seq.append(past_seen_token_num) + prompt_lengths.append(request.py_prompt_len) + request.py_batch_idx = request.py_seq_slot + + num_first_draft_requests = num_generation_tokens - num_gen_requests + total_num_tokens = num_first_draft_requests * tokens_per_first_draft + + self._update_draft_input_tensors( + num_accepted_tokens_device=num_accepted_tokens_device, + new_tokens_device=new_tokens_device, + total_num_tokens=total_num_tokens, + num_first_draft_requests=num_first_draft_requests) + + # Prepare spec_metadata + if spec_metadata is not None: + spec_metadata.draft_tokens = [] + spec_metadata.gather_ids = self.gather_ids_cuda[: + num_generation_tokens] + spec_metadata.num_accepted_draft_tokens = self.num_accepted_draft_tokens_cuda[: + num_generation_tokens] + + # Use common metadata preparation logic + virtual_num_tokens = total_num_tokens + num_gen_requests + lora_params = self._prepare_incremental_update_metadata( + scheduled_requests=scheduled_requests, + kv_cache_manager=kv_cache_manager, + attn_metadata=attn_metadata, + spec_metadata=spec_metadata, + prompt_lengths=prompt_lengths, + num_cached_tokens_per_seq=num_cached_tokens_per_seq, + total_num_tokens=virtual_num_tokens, + num_generation_tokens=num_generation_tokens, + num_extend_ctx_requests=0) + + # No padding because there are only generation requests. 
+ attn_metadata.padded_num_tokens = None + if self.enable_attention_dp: + attn_metadata.all_rank_num_tokens = self._get_all_rank_num_tokens( + attn_metadata) + + final_position_ids = self.position_ids_cuda[: + virtual_num_tokens].unsqueeze( + 0) + + inputs = { + 'attn_metadata': attn_metadata, + 'input_ids': self.input_ids_cuda[:virtual_num_tokens], + 'position_ids': final_position_ids, + 'inputs_embeds': None, + "multimodal_params": [], + } + + if bool(lora_params): + inputs['lora_params'] = lora_params + + if spec_metadata is not None: + inputs['spec_metadata'] = spec_metadata + + return inputs, self.gather_ids_cuda[:num_generation_tokens] + + @torch.compile(options={"max-autotune": True}) + def _update_target_input_tensors( + self, num_accepted_tokens_device: torch.Tensor, + new_tokens_device: torch.Tensor, + next_draft_tokens_device: torch.Tensor, + new_tokens_lens_device: torch.Tensor, previous_slots: torch.Tensor, + total_num_tokens: int, num_extend_reqeust_wo_dummy: int, + num_tokens_per_extend_request: int, + previous_batch_draft_tokens: int): + """ + This function performs in-place updates on position_ids, num_accepted_draft_tokens, + input_ids, draft_tokens, and offset tensors for speculative decoding extend context operations. + """ + + # Prepare position_ids + idx_accepted_tokens = self.idx_accepted_tokens_cache[:total_num_tokens] + self.position_ids_cuda[:total_num_tokens].add_( + self.num_accepted_draft_tokens_cuda[idx_accepted_tokens] + 1) + + self.num_accepted_draft_tokens_cuda[:num_extend_reqeust_wo_dummy].copy_( + num_accepted_tokens_device[:num_extend_reqeust_wo_dummy], + non_blocking=True) + + # Initialize offset tensors to zeros + self.previous_pos_id_offsets_cuda.mul_(0) + self.previous_kv_lens_offsets_cuda.mul_(0) + + # Prepare input_ids + new_tokens = new_tokens_device.transpose( + 0, 1)[previous_slots, :].flatten() + self.input_ids_cuda[:total_num_tokens].copy_(new_tokens, + non_blocking=True) + + # Prepare draft tokens + self.draft_tokens_cuda[:previous_batch_draft_tokens].copy_( + next_draft_tokens_device[previous_slots, :].flatten(), + non_blocking=True) + + # Compute kv_len_offsets and update offset tensors + previous_pos_indices = previous_slots.repeat_interleave( + num_tokens_per_extend_request) + self.previous_pos_indices_cuda[:total_num_tokens].copy_( + previous_pos_indices, non_blocking=True) + kv_len_offsets_device = new_tokens_lens_device - num_tokens_per_extend_request + self.previous_pos_id_offsets_cuda[:num_extend_reqeust_wo_dummy * + num_tokens_per_extend_request].copy_( + new_tokens_lens_device[ + self. 
+ previous_pos_indices_cuda[: + total_num_tokens]], + non_blocking=True) + self.previous_kv_lens_offsets_cuda[:num_extend_reqeust_wo_dummy].copy_( + kv_len_offsets_device[previous_slots], non_blocking=True) + + def _apply_incremental_update_target( + self, + scheduled_requests: ScheduledRequests, + kv_cache_manager: KVCacheManager, + attn_metadata: AttentionMetadata, + spec_metadata: Optional[SpecMetadata] = None, + new_tensors_device: Optional[SampleStateTensors] = None, + num_accepted_tokens_device: Optional[torch.Tensor] = None): + # Extract tensors from new_tensors_device + new_tokens_device = new_tensors_device.new_tokens # [batch, 1 + draft_len] + new_tokens_lens_device = new_tensors_device.new_tokens_lens # [batch] + next_draft_tokens_device = new_tensors_device.next_draft_tokens # [batch, draft_len] + + # Pre-compute constants + extend_requests = scheduled_requests.generation_requests + num_extend_requests = len(extend_requests) + num_tokens_per_extend_request = self.original_max_draft_len + 1 + spec_config = self.spec_config + + prompt_lengths = torch.empty(num_extend_requests, + dtype=torch.int, + device='cpu', + pin_memory=True) + num_cached_tokens_per_seq = torch.empty(num_extend_requests, + dtype=torch.int, + device='cpu', + pin_memory=True) + previous_batch_indices = torch.empty(num_extend_requests, + dtype=torch.int, + device='cpu', + pin_memory=True) + + request_accepted_path = {} + num_extend_dummy_requests = 0 + num_previous_batch = 0 + + use_extend_ctx = (self.enable_spec_decode + and spec_config.spec_dec_mode.extend_ctx( + self.attn_backend) and spec_config.is_linear_tree) + + for idx, request in enumerate(extend_requests): + request_accepted_path[request.py_request_id] = \ + request.py_num_accepted_draft_tokens_indices + + base_past_seen = request.max_beam_num_tokens - 1 + + if use_extend_ctx: + # We're treating the prompt lengths as context requests here, so + # the prompt lens should not include the cached tokens. 
+ prompt_lengths[idx] = num_tokens_per_extend_request + else: + prompt_lengths[idx] = request.py_prompt_len + + if request.is_dummy: + num_cached_tokens_per_seq[idx] = base_past_seen + request.cached_tokens = base_past_seen + num_extend_dummy_requests += 1 + else: + # Request has previous tensor + previous_batch_indices[ + num_previous_batch] = request.py_batch_idx + num_previous_batch += 1 + + num_cached_tokens_per_seq[ + idx] = base_past_seen + num_tokens_per_extend_request + request.cached_tokens = num_cached_tokens_per_seq[idx].item() + + request.py_batch_idx = request.py_seq_slot + + num_extend_reqeust_wo_dummy = num_extend_requests - num_extend_dummy_requests + total_num_tokens = num_extend_reqeust_wo_dummy * num_tokens_per_extend_request + + previous_slots = self.previous_batch_indices_cuda[:num_previous_batch] + previous_slots.copy_(previous_batch_indices[:num_previous_batch], + non_blocking=True) + + prompt_lengths = prompt_lengths.tolist() + num_cached_tokens_per_seq = num_cached_tokens_per_seq.tolist() + + previous_batch_draft_tokens = num_extend_reqeust_wo_dummy * self.runtime_draft_len + + self._update_target_input_tensors( + num_accepted_tokens_device=num_accepted_tokens_device, + new_tokens_device=new_tokens_device, + next_draft_tokens_device=next_draft_tokens_device, + new_tokens_lens_device=new_tokens_lens_device, + previous_slots=previous_slots, + total_num_tokens=total_num_tokens, + num_extend_reqeust_wo_dummy=num_extend_reqeust_wo_dummy, + num_tokens_per_extend_request=num_tokens_per_extend_request, + previous_batch_draft_tokens=previous_batch_draft_tokens) + + # Prepare spec_metadata + num_generation_tokens = num_extend_requests * num_tokens_per_extend_request + if spec_metadata is not None: + total_draft_lens = self.max_total_draft_tokens * num_extend_requests + spec_metadata.draft_tokens = self.draft_tokens_cuda[: + total_draft_lens] + spec_metadata.gather_ids = self.gather_ids_cuda[:total_num_tokens] + spec_metadata.num_accepted_draft_tokens = self.num_accepted_draft_tokens_cuda[: + num_extend_requests] + + # Determine if we're using extend_ctx mode for linear tree decoding + num_extend_ctx_requests = 0 + if self.enable_spec_decode and spec_config.spec_dec_mode.extend_ctx( + self.attn_backend) and spec_config.is_linear_tree: + num_extend_ctx_requests = num_extend_requests + + virtual_num_tokens = num_generation_tokens + lora_params = self._prepare_incremental_update_metadata( + scheduled_requests=scheduled_requests, + kv_cache_manager=kv_cache_manager, + attn_metadata=attn_metadata, + spec_metadata=spec_metadata, + prompt_lengths=prompt_lengths, + num_cached_tokens_per_seq=num_cached_tokens_per_seq, + total_num_tokens=virtual_num_tokens, + num_generation_tokens=num_generation_tokens, + request_accepted_path=request_accepted_path, + num_extend_ctx_requests=num_extend_ctx_requests) + + # No padding because there are only generation requests. 
+ attn_metadata.padded_num_tokens = None + if self.enable_attention_dp: + attn_metadata.all_rank_num_tokens = self._get_all_rank_num_tokens( + attn_metadata) + + final_position_ids = self.position_ids_cuda[: + virtual_num_tokens].unsqueeze( + 0) + + # Prepare inputs + # Note: multimodal_params is always empty for incremental updates because: + # - This function only processes generation requests (no context requests) + # - Multimodal data (images/videos) is only needed during context/prefill phase + inputs = { + 'attn_metadata': attn_metadata, + 'input_ids': self.input_ids_cuda[:virtual_num_tokens], + 'position_ids': final_position_ids, + 'inputs_embeds': None, + "multimodal_params": [], + } + + if bool(lora_params): + inputs['lora_params'] = lora_params + + if spec_metadata is not None: + inputs['spec_metadata'] = spec_metadata + + return inputs, self.gather_ids_cuda[:num_generation_tokens] + def _prepare_tp_inputs( self, scheduled_requests: ScheduledRequests, @@ -1358,6 +1852,15 @@ class PyTorchModelEngine(ModelEngine): self.guided_decoder.add_batch(scheduled_requests, new_tokens=new_tokens_device) + if self._can_use_incremental_update(scheduled_requests, + new_tokens_device, + next_draft_tokens_device): + return self._apply_incremental_update( + scheduled_requests, kv_cache_manager, attn_metadata, + spec_metadata, new_tensors_device, cache_indirection_buffer, + num_accepted_tokens_device, req_id_to_old_request, + resource_manager) + # if new_tensors_device exist, input_ids will only contain new context tokens input_ids = [] # per sequence sequence_lengths = [] # per sequence @@ -1564,7 +2067,8 @@ class PyTorchModelEngine(ModelEngine): # overlap scheduler can only support the speculative decoding # methods with a fixed number of draft tokens sequence_lengths.append(1 + self.runtime_draft_len) - num_accepted_draft_tokens.append(self.runtime_draft_len) + num_accepted_draft_tokens.append( + request.py_num_accepted_draft_tokens) past_seen_token_num = request.max_beam_num_tokens - 1 draft_lens.append(self.runtime_draft_len) gather_ids.extend( @@ -2146,6 +2650,14 @@ class PyTorchModelEngine(ModelEngine): self.iter_states['num_ctx_requests'] = num_ctx_requests self.iter_states['num_ctx_tokens'] = num_ctx_tokens self.iter_states['num_generation_tokens'] = num_generation_tokens + + if not self.is_warmup: + self.previous_request_ids = [ + request.py_request_id + for request in scheduled_requests.generation_requests + ] + self.has_previous_device_draft = next_draft_tokens_device is not None + return inputs, self.gather_ids_cuda[:len( gather_ids)] if self.enable_spec_decode else None diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 0f2d06a007..6ccae36fdc 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -1601,9 +1601,10 @@ class PyExecutor: # When there's any accepted tokens, we can't directly use the previous batch's outputs in this iteration for the target model, # so we'll set the target model's input to None and skip updating the target requests after target model forward. 
use_previous_draft_tokens = self.has_previous_draft_tokens + num_accepted_tokens_device = None if self.drafter is not None and (self.use_spec_decode or use_previous_draft_tokens): - target_inputs = self._handle_speculative_decoding( + target_inputs, num_accepted_tokens_device = self._handle_speculative_decoding( scheduled_batch, previous_tensors, previous_tensors_device) @@ -1616,8 +1617,9 @@ class PyExecutor: else: previous_tensors_device = self.previous_batch and self.previous_batch.sample_state and self.previous_batch.sample_state.device - batch_outputs = self._forward_step(scheduled_batch, - previous_tensors_device) + batch_outputs = self._forward_step( + scheduled_batch, previous_tensors_device, + num_accepted_tokens_device) if self.previous_batch is not None: self._update_requests(self.previous_batch.sample_state) @@ -1684,6 +1686,8 @@ class PyExecutor: self.iter_counter += 1 + @nvtx_range("_accept_draft_tokens") + @torch.compile(options={"max-autotune": True}) def _accept_draft_tokens( self, scheduled_batch: ScheduledRequests, target_outputs: SampleStateTensors, @@ -2184,22 +2188,26 @@ class PyExecutor: self.kv_cache_transceiver.check_gen_transfer_status(atLeastNum) self._check_cache_transfer_errors("generation requests") - def _forward_step(self, - scheduled_requests, - new_tensors_device: Optional[SampleStateTensors] = None): + def _forward_step( + self, + scheduled_requests, + new_tensors_device: Optional[SampleStateTensors] = None, + num_accepted_tokens_device: Optional[torch.Tensor] = None): ExpertStatistic.set_iter(self.iter_counter) @nvtx_range( f"[Executor] _forward_step {self.iter_counter}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs" ) def forward(scheduled_requests, resource_manager, new_tensors_device, - gather_context_logits, cache_indirection_buffer): + gather_context_logits, cache_indirection_buffer, + num_accepted_tokens_device): return self.model_engine.forward( scheduled_requests, resource_manager, new_tensors_device, gather_context_logits=gather_context_logits, - cache_indirection_buffer=cache_indirection_buffer) + cache_indirection_buffer=cache_indirection_buffer, + num_accepted_tokens_device=num_accepted_tokens_device) try: gather_context_logits = any( @@ -2208,7 +2216,8 @@ class PyExecutor: cache_indirection_buffer = self.sampler.get_cache_indirection() outputs = forward(scheduled_requests, self.resource_manager, new_tensors_device, gather_context_logits, - cache_indirection_buffer) + cache_indirection_buffer, + num_accepted_tokens_device) self._kv_connector_wait_for_save() @@ -2732,8 +2741,9 @@ class PyExecutor: ) self.inflight_req_ids.erase(req.request_id) - def _handle_speculative_decoding(self, scheduled_batch, previous_tensors, - target_inputs): + def _handle_speculative_decoding( + self, scheduled_batch, previous_tensors, target_inputs + ) -> Tuple[Optional[SampleStateTensorsMTP], Optional[torch.Tensor]]: with request_context(is_draft=self.draft_model_engine is not None, scheduled_requests=scheduled_batch): # Do an early checking to see if we need to forward the draft model. 
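The gating in _can_use_incremental_update and the in-place buffer updates above follow one pattern: when the generation batch is identical to the previous iteration (same request ids, no context requests, sampled tokens already on device), host-side input preparation is skipped and only small on-device deltas are applied. A condensed sketch of that pattern in plain PyTorch; the class and method names are illustrative, not the engine's API.

    from typing import List, Optional

    import torch


    class IncrementalInputs:
        """Persistent device buffers that accept per-iteration deltas in place."""

        def __init__(self, max_tokens: int, device: str = "cuda") -> None:
            self.input_ids = torch.zeros(max_tokens, dtype=torch.long, device=device)
            self.position_ids = torch.zeros(max_tokens, dtype=torch.long, device=device)
            self.prev_request_ids: List[int] = []

        def can_reuse(self, request_ids: List[int], num_context_requests: int,
                      new_tokens: Optional[torch.Tensor]) -> bool:
            # Reuse only when the batch layout cannot have changed since the last step.
            return (num_context_requests == 0 and new_tokens is not None
                    and request_ids == self.prev_request_ids)

        def apply_delta(self, new_tokens: torch.Tensor, num_accepted: torch.Tensor,
                        tokens_per_request: int) -> None:
            # new_tokens: [batch, tokens_per_request]; num_accepted: [batch]
            n = new_tokens.numel()
            # Each request advances by (accepted draft tokens + 1), broadcast to its tokens.
            step = (num_accepted + 1).repeat_interleave(tokens_per_request)
            self.position_ids[:n].add_(step[:n])
            # Write the freshly sampled tokens without a host round trip.
            self.input_ids[:n].copy_(new_tokens.reshape(-1), non_blocking=True)

In the patch itself these updates are additionally wrapped in torch.compile and write into pre-allocated index buffers, which is what keeps them compatible with the CUDA-graph path that the gating requires.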
@@ -2744,6 +2754,7 @@ class PyExecutor: and self.drafter.should_forward_draft_model(scheduled_batch)) new_target_inputs = None + num_accepted_tokens_device = None if has_draft_batch: target_outputs = self.previous_batch.sample_state and self.previous_batch.sample_state.device assert target_outputs is not None, "target_outputs should not be None" @@ -2774,7 +2785,7 @@ class PyExecutor: for request in scheduled_batch.all_requests(): request.py_draft_tokens = [] - return new_target_inputs + return new_target_inputs, num_accepted_tokens_device def reset_prefix_cache(self): self.kv_cache_manager.reset_reuse_state() diff --git a/tensorrt_llm/_torch/speculative/model_drafter.py b/tensorrt_llm/_torch/speculative/model_drafter.py index 0570b8107b..5b8c23e000 100644 --- a/tensorrt_llm/_torch/speculative/model_drafter.py +++ b/tensorrt_llm/_torch/speculative/model_drafter.py @@ -533,10 +533,13 @@ class ModelDrafter(Drafter): if has_draft_tokens: # We already updated the target state, so the new_tokens_lens should be all ones. - new_tokens_lens = torch.ones(batch_size, device=device) + new_tokens_lens = torch.ones(batch_size, + dtype=torch.int, + device=device) new_tokens_lens += num_accepted_tokens_device next_draft_tokens = torch.zeros(batch_size, self.max_draft_len, + dtype=torch.int, device=device) target_inputs.new_tokens_lens = new_tokens_lens target_inputs.next_draft_tokens = next_draft_tokens From 356ad4fe3a6024e2a610008b41295c6bccf54f69 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:34:04 +0800 Subject: [PATCH 09/25] [https://nvbugs/5722653][fix] Address port conflict by assigning different port section in the same node. (#10035) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- jenkins/L0_Test.groovy | 78 ++++++++++++++++++- tests/integration/defs/common.py | 44 ++++++++++- tests/integration/defs/test_e2e.py | 12 +++ .../integration/test_lists/test-db/l0_a10.yml | 1 + 4 files changed, 131 insertions(+), 4 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index c106894a16..5f3f5add62 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -770,7 +770,9 @@ def getPytestBaseCommandLine( String trtllmWheelPath, String coverageConfigFile, String pytestUtil = "", - List extraArgs = [] + List extraArgs = [], + int containerPortStart = 0, + int containerPortNum = 0 ) { def extraInternalEnv = "" def pytestTestTimeout = "3600" @@ -782,6 +784,12 @@ def getPytestBaseCommandLine( // Enable NCCL debug information for multi-GPU tests extraInternalEnv += " NCCL_DEBUG=INFO" + // Container port allocation environment variables for avoiding port conflicts + def portEnvVars = "" + if (containerPortStart > 0 && containerPortNum > 0) { + portEnvVars = "CONTAINER_PORT_START=${containerPortStart} CONTAINER_PORT_NUM=${containerPortNum}" + } + def testCmdLine = [ "LLM_ROOT=${llmSrc}", "LLM_BACKEND_ROOT=${llmSrc}/triton_backend", @@ -789,6 +797,7 @@ def getPytestBaseCommandLine( "MODEL_CACHE_DIR=${MODEL_CACHE_DIR}", "COLUMNS=200", extraInternalEnv, + portEnvVars, pytestUtil, "pytest", "-v", @@ -1264,6 +1273,61 @@ def globalVars = [ class GlobalState { static def uploadResultStageNames = [] + + // HOST_NODE_NAME to starting port section map + // This map maintains the next available starting port for each host node + // to avoid port conflicts when running parallel tests on the same node. 
+ // Key: HOST_NODE_NAME (e.g., "node-01.cluster.local") + // Value: Next available starting port number for that node + static def hostNodePortMap = [:] + + // Port allocation configuration + static final int BASE_PORT = 10000 // Base starting port + static final int PORT_SECTION_SIZE = 1000 // Number of ports per section/stage + static final int MAX_PORT = 32000 // Maximum port number to avoid system ports +} + +/** + * Allocates and returns a starting port section for the given host node. + * This function is thread-safe and ensures each stage running on the same + * host node gets a unique port range to avoid conflicts. + * + * @param hostNodeName The HOST_NODE_NAME of the node running the stage + * @param stageName Optional stage name for logging purposes + * @return The starting port number for this stage's port section + */ +def getStartingPortForHost(String hostNodeName, String stageName = "") { + lock(resource: 'globalstate-hostNodePortMap') { + def startingPort = GlobalState.hostNodePortMap.get(hostNodeName, GlobalState.BASE_PORT) + + // Store the next available starting port for this host + def nextPort = startingPort + GlobalState.PORT_SECTION_SIZE + + // Wrap around if we exceed MAX_PORT + if (nextPort > GlobalState.MAX_PORT) { + nextPort = GlobalState.BASE_PORT + } + + GlobalState.hostNodePortMap[hostNodeName] = nextPort + + return startingPort + } +} + +/** + * Gets the HOST_NODE_NAME from the current environment. + * Falls back to hostname if HOST_NODE_NAME is not set. + * + * @return The host node name + */ +def getHostNodeName() { + return sh(script: ''' + if [ -n "$HOST_NODE_NAME" ]; then + echo "$HOST_NODE_NAME" + else + hostname -f || hostname + fi + ''', returnStdout: true).trim() } String getShortenedJobName(String path) @@ -2449,6 +2513,12 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO cat ${coverageConfigFile} """ echoNodeAndGpuInfo(pipeline, stageName) + + // Allocate a unique port section for this container to avoid port conflicts + def hostNodeName = getHostNodeName() + def containerPortStart = getStartingPortForHost(hostNodeName, stageName) + def containerPortNum = GlobalState.PORT_SECTION_SIZE + // Some clusters do not allow dmesg -C so we add || true sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi' def pytestCommand = getPytestBaseCommandLine( @@ -2458,7 +2528,11 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO perfMode, "${WORKSPACE}/${stageName}", TRTLLM_WHL_PATH, - coverageConfigFile + coverageConfigFile, + "", // pytestUtil + [], // extraArgs + containerPortStart, + containerPortNum ) // Only add --test-list if there are regular tests to run diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py index 5ff8799fb3..c79fc4ed61 100644 --- a/tests/integration/defs/common.py +++ b/tests/integration/defs/common.py @@ -1153,17 +1153,57 @@ def wait_for_server(host, port, timeout_seconds=180): return False +PORTS_IN_USE = set() + + +def get_free_port_in_ci(max_attempts=100): + """ + Get a free port in the range [CONTAINER_PORT_START, CONTAINER_PORT_START + CONTAINER_PORT_NUM - 1] + If CONTAINER_PORT_START and CONTAINER_PORT_NUM are not set or all ports are already in use, fallback to get_free_port + """ + container_port_start = int(os.environ.get("CONTAINER_PORT_START", -1)) + container_port_num = int(os.environ.get("CONTAINER_PORT_NUM", -1)) + if container_port_start != -1 and container_port_num != -1: + for i in range(container_port_num): + port = 
container_port_start + i + if port in PORTS_IN_USE: + continue + + # Check if the port is free + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + s.bind(("localhost", port)) + + # Port is free, add it to the set of used ports + PORTS_IN_USE.add(port) + return port + except OSError: + # Port is not free, try the next port + continue + + # No port found in the range, try to get a random free port from the system + for i in range(max_attempts): + port = get_free_port() + if port not in PORTS_IN_USE: + PORTS_IN_USE.add(port) + return port + + raise Exception( + f"Failed to find a free port both in container port range and system after {max_attempts} attempts" + ) + + def revise_disaggregated_server_config_urls_with_free_ports( disaggregated_server_config: dict[str, Any]) -> dict[str, Any]: # Revise serve port - disaggregated_server_config['port'] = get_free_port() + disaggregated_server_config['port'] = get_free_port_in_ci() # Revise context and generation server urls ctx_urls = disaggregated_server_config["context_servers"]["urls"] gen_urls = disaggregated_server_config["generation_servers"]["urls"] url_map = dict() for url in set(ctx_urls + gen_urls): - url_map[url] = (url.split(':')[0], get_free_port()) + url_map[url] = (url.split(':')[0], get_free_port_in_ci()) for i, url in enumerate(ctx_urls): disaggregated_server_config["context_servers"]["urls"][ diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 9c94f815da..b75781bf0f 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -3369,3 +3369,15 @@ def test_eagle3_output_consistency_4gpus(model_dir: str, draft_model_dir: str): f"Eagle3 output contains repetitive characters: {output_spec[:500]}") assert not repetitive_pattern.search(output_ref), ( f"Baseline output contains repetitive characters: {output_ref[:500]}") + + +def test_get_ci_container_port(): + container_port_start = os.environ.get("CONTAINER_PORT_START", None) + container_port_num = os.environ.get("CONTAINER_PORT_NUM", None) + assert container_port_start is not None + assert container_port_num is not None + container_port_start = int(container_port_start) + container_port_num = int(container_port_num) + assert container_port_start > 0 + assert container_port_num > 0 + assert container_port_start + container_port_num <= 60000 diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 4f986b3e2d..03a7b53b5a 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -53,6 +53,7 @@ l0_a10: - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-False-TinyLlama-1.1B-Chat-v1.0] - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-True-TinyLlama-1.1B-Chat-v1.0] - test_e2e.py::test_openai_chat_guided_decoding + - test_e2e.py::test_get_ci_container_port - test_e2e.py::test_openai_chat_multimodal_example ISOLATION - test_e2e.py::test_openai_mmencoder_example - test_e2e.py::test_openai_perf_metrics From cb0444b1b5ac85b590c8948d1e87c3877b5c0dc3 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Fri, 19 Dec 2025 16:07:56 +0800 Subject: [PATCH 10/25] [TRTLLM-8638][fix] Add failed cases into waives.txt (#10132) Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Co-authored-by: Larry Xu <197874197+LarryXFly@users.noreply.github.com> --- 
tests/integration/test_lists/waives.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 4f6d8e75a4..a5839449ff 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -480,3 +480,15 @@ unittest/llmapi/apps/test_disagg_serving_perf_metrics.py SKIP (https://nvbugs/57 unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5752521) unittest/llmapi/apps/_test_openai_responses.py::test_reasoning_effort[DeepSeek-R1-Distill-Qwen-1.5B] SKIP (https://nvbugs/5753250) unittest/llmapi/apps/_test_openai_responses.py::test_multi_turn_chat[Qwen3/Qwen3-0.6B] SKIP (https://nvbugs/5753250) +examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979) +examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5608979) +examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979) +examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979) +examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979) +examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) +examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) +examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) +examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2.5_7b_chat-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5754976) +examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] SKIP (https://nvbugs/5754976) +examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int8-float16-nb:1-use_cpp_runtime] SKIP (https://nvbugs/5568052) +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype SKIP (https://nvbugs/5588376) From 52cee573adad71d38b6ec95d3af5c951cf46c7b0 Mon Sep 17 00:00:00 2001 From: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> Date: Fri, 19 Dec 2025 17:01:52 +0800 Subject: [PATCH 11/25] [TRTLLM-8830][test] Overlap scheduler enhancement perf test: Add qwen3_0,8b and llama3.1 test cases (#10114) Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com> --- tests/integration/defs/perf/test_perf.py | 1 + .../test_lists/qa/llm_perf_core.yml | 1 - .../test_lists/qa/llm_perf_sanity.yml | 130 ++++++------------ 3 files changed, 46 insertions(+), 86 deletions(-) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 82891ca847..f6d81460fe 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -111,6 +111,7 @@ MODEL_PATH_DICT = { "deepseek_v3_lite_nvfp4": 
"DeepSeek-V3-Lite/nvfp4_moe_only", "qwen2_7b_instruct": "Qwen2-7B-Instruct", "qwen_14b_chat": "Qwen-14B-Chat", + "qwen3_0.6b": "Qwen3/Qwen3-0.6B", "qwen3_4b_eagle3": "Qwen3/Qwen3-4B", "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml index b8f8b1f222..2db0b307b2 100644 --- a/tests/integration/test_lists/qa/llm_perf_core.yml +++ b/tests/integration/test_lists/qa/llm_perf_core.yml @@ -94,7 +94,6 @@ llm_perf_core: - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:48-maxnt:256-input_output_len:1000,2000-reqs:500-con:200] - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:128-maxnt:512-input_output_len:1000,2000-reqs:500-con:200] #llama_v3.1_8b - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-mp-maxbs:256-input_output_len:128,128-pp:2] - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2] - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2] #mixtral_8x7b_v0.1 diff --git a/tests/integration/test_lists/qa/llm_perf_sanity.yml b/tests/integration/test_lists/qa/llm_perf_sanity.yml index 069bd02ea2..0348cef095 100644 --- a/tests/integration/test_lists/qa/llm_perf_sanity.yml +++ b/tests/integration/test_lists/qa/llm_perf_sanity.yml @@ -1,6 +1,18 @@ version: 0.0.1 llm_perf_sanity: -# A100, L40S, L20, H20, H100, H200, Blackwell +# =============================================================================== +# Test Conditions Index +# =============================================================================== +# 1: All GPUs +# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server +# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server +# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server +# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server +# 6: H20, H100, H200, B200, B300, RTX6000-Server +# 7: H20, H100, H200, B200, B300 +# =============================================================================== + +# 1: All GPUs - condition: ranges: system_gpu_count: @@ -28,10 +40,13 @@ llm_perf_sanity: - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000] - perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512] - perf/test_perf.py::test_perf[qwen3_4b_eagle3-bench-pytorch-streaming-bfloat16-maxbs:4-kv_frac:0.6-input_output_len:500,100-reqs:200-con:4] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2] -# FP8 specific tests -# A100, L40S, L20, H20, H100, H200, Blackwell +# 2: L40S, L20, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server - condition: terms: supports_fp8: true @@ -45,94 +60,40 @@ llm_perf_sanity: - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-reqs:8-con:1] - 
perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250] - perf/test_perf.py::test_perf[ministral_8b_fp8-bench-pytorch-float8-input_output_len:500,2000-reqs:500-con:250] - -# Tests for ALL systems with 2+ GPUs -# A100, L40S, L20, H20, H100, H200, Blackwell -- condition: - ranges: - system_gpu_count: - gte: 2 - tests: - #llama_v3.1_8b_instruct - #pytorch backend - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-maxnt:256-input_output_len:128,128-reqs:10-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2] - -# FP8 tests for systems with 2+ GPUs -# A100, L40S, L20, H20, H100, H200, Blackwell -- condition: - terms: - supports_fp8: true - ranges: - system_gpu_count: - gte: 2 - - tests: - #mixtral_8x7b_v0.1_fp8 pytorch backend - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2] -# Tests for systems with 2+ GPUs and high memory -# A100, L40S, H20, H100, H200, Blackwell + +# 3: A100, H20, H100, H200, B200, B300, RTX6000-Server - condition: ranges: - system_gpu_count: - gte: 2 gpu_memory: gt: 80000 - tests: - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2] - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2] - -# Tests for systems with 4+ GPUs -# A100, L40S, H20, H100, H200, Blackwell -- condition: - ranges: - system_gpu_count: - gte: 4 - - tests: - #llama_v3.1_70b - #trt backend - #pytorch backend - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4] - -# FP8 specific tests -# L40S, H20, H100, H200, Blackwell -- condition: - terms: - supports_fp8: true - ranges: - system_gpu_count: - gte: 4 - - tests: - - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] - -# Tests for systems with 8+ GPUs -# A100, L40S, H20, H100, H200, Blackwell -- condition: - ranges: - system_gpu_count: - gte: 8 - gpu_memory: - gt: 46000 - - tests: - #llama_v3.1_70b - #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8] -# FP8 tests for systems with 8+ GPUs -# L40S, H20, H100, H200, Blackwell +# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server +- condition: + ranges: + system_gpu_count: + gte: 4 + + tests: + # 
llama_v3.1_70b + # trt backend + # pytorch backend + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4] + # test overlap scheduler + - perf/test_perf.py::test_perf[qwen3_0.6b-bench-pytorch-bfloat16-maxnt:2048-input_output_len:8000,1000-reqs:256-con:1-pp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:1000,1000-reqs:1000-con:200] + + +# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server - condition: terms: supports_fp8: true @@ -141,17 +102,15 @@ llm_perf_sanity: gte: 8 tests: - #llama_v3.1_70b - #trt backend + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:4-gpus:4] - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-maxnt:544-input_output_len:512,32-quant:fp8-gpus:8] - #llama_v3.3_70b_instruct_fp8 - #pytorch backend - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8] - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8] -# FP4, FP8 tests for systems with 8+ GPUs -# H20, H100, H200, Blackwell +# 6: H20, H100, H200, B200, B300, RTX6000-Server - condition: ranges: system_gpu_count: @@ -171,10 +130,11 @@ llm_perf_sanity: - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60) - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4] - perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4] - # gpt_oss_20b_fp4 + # gpt_oss_20b_fp4 - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512] -# gpu_arch > Hopper, exclude GB20X, RTX 6000 for not supported + +# 7: H20, H100, H200, B200, B300 - condition: ranges: system_gpu_count: From 31bc14b3507d749b183722e3c0de38f588e725fc Mon Sep 17 00:00:00 2001 From: Chang Liu <9713593+chang-l@users.noreply.github.com> Date: Fri, 19 Dec 2025 01:05:38 -0800 Subject: [PATCH 12/25] [TRTLLM-9654][feat] Support DeepSeek-V32 chat template (#9814) Signed-off-by: Chang Liu (Enterprise Products) <9713593+chang-l@users.noreply.github.com> --- pyproject.toml | 1 + tensorrt_llm/commands/eval.py | 23 +- tensorrt_llm/commands/serve.py | 43 +- tensorrt_llm/inputs/utils.py | 15 +- tensorrt_llm/llmapi/llm_args.py | 43 ++ tensorrt_llm/llmapi/tokenizer.py | 384 +--------------- tensorrt_llm/tokenizer/__init__.py | 21 + .../tokenizer/deepseek_v32/__init__.py | 14 + .../tokenizer/deepseek_v32/encoding.py | 425 ++++++++++++++++++ .../tokenizer/deepseek_v32/tokenizer.py | 147 ++++++ tensorrt_llm/tokenizer/tokenizer.py | 365 +++++++++++++++ .../api_stability/references/llm.yaml | 4 + 12 files changed, 1097 insertions(+), 388 deletions(-) create mode 100644 tensorrt_llm/tokenizer/__init__.py create mode 100644 tensorrt_llm/tokenizer/deepseek_v32/__init__.py create mode 100644 tensorrt_llm/tokenizer/deepseek_v32/encoding.py create mode 100644 tensorrt_llm/tokenizer/deepseek_v32/tokenizer.py create mode 
100644 tensorrt_llm/tokenizer/tokenizer.py diff --git a/pyproject.toml b/pyproject.toml index b453d975e3..031d41850d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -848,6 +848,7 @@ exclude = [ "tensorrt_llm/serve/tool_parser/base_tool_parser.py", "tensorrt_llm/serve/tool_parser/qwen3_tool_parser.py", "tensorrt_llm/serve/tool_parser/utils.py", + "tensorrt_llm/tokenizer/tokenizer.py", "tensorrt_llm/tools/__init__.py", "tensorrt_llm/tools/importlib_utils.py", "tensorrt_llm/tools/multimodal_builder.py", diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py index d849a7c91a..76c6c63eb8 100644 --- a/tensorrt_llm/commands/eval.py +++ b/tensorrt_llm/commands/eval.py @@ -39,6 +39,14 @@ from ..logger import logger, severity_map default=None, help="Path | Name of the tokenizer." "Specify this value only if using TensorRT engine as model.") +@click.option( + "--custom_tokenizer", + type=str, + default=None, + help= + "Custom tokenizer type: alias (e.g., 'deepseek_v32') or Python import path " + "(e.g., 'tensorrt_llm.tokenizer.deepseek_v32.DeepseekV32Tokenizer'). [Experimental]" +) @click.option( "--backend", type=click.Choice(["pytorch", "tensorrt"]), @@ -109,13 +117,13 @@ from ..logger import logger, severity_map default=False, help="Flag for disabling KV cache reuse.") @click.pass_context -def main(ctx, model: str, tokenizer: Optional[str], log_level: str, - backend: str, max_beam_width: int, max_batch_size: int, - max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int, - ep_size: Optional[int], gpus_per_node: Optional[int], - kv_cache_free_gpu_memory_fraction: float, trust_remote_code: bool, - revision: Optional[str], extra_llm_api_options: Optional[str], - disable_kv_cache_reuse: bool): +def main(ctx, model: str, tokenizer: Optional[str], + custom_tokenizer: Optional[str], log_level: str, backend: str, + max_beam_width: int, max_batch_size: int, max_num_tokens: int, + max_seq_len: int, tp_size: int, pp_size: int, ep_size: Optional[int], + gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float, + trust_remote_code: bool, revision: Optional[str], + extra_llm_api_options: Optional[str], disable_kv_cache_reuse: bool): logger.set_level(log_level) kv_cache_config = KvCacheConfig( @@ -125,6 +133,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str, llm_args = { "model": model, "tokenizer": tokenizer, + "custom_tokenizer": custom_tokenizer, "tensor_parallel_size": tp_size, "pipeline_parallel_size": pp_size, "moe_expert_parallel_size": ep_size, diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 7e08295ade..6943df0c1a 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -82,6 +82,7 @@ def _signal_handler_cleanup_child(signum, frame): def get_llm_args( model: str, tokenizer: Optional[str] = None, + custom_tokenizer: Optional[str] = None, backend: str = "pytorch", max_beam_width: int = BuildConfig.model_fields["max_beam_width"]. default, @@ -137,6 +138,7 @@ def get_llm_args( "model": model, "scheduler_config": scheduler_config, "tokenizer": tokenizer, + "custom_tokenizer": custom_tokenizer, "tensor_parallel_size": tensor_parallel_size, "pipeline_parallel_size": pipeline_parallel_size, "context_parallel_size": context_parallel_size, @@ -262,6 +264,14 @@ class ChoiceWithAlias(click.Choice): default=None, help="Path | Name of the tokenizer." 
"Specify this value only if using TensorRT engine as model.") +@click.option( + "--custom_tokenizer", + type=str, + default=None, + help= + "Custom tokenizer type: alias (e.g., 'deepseek_v32') or Python import path " + "(e.g., 'tensorrt_llm.tokenizer.deepseek_v32.DeepseekV32Tokenizer'). [Experimental]" +) @click.option("--host", type=str, default="localhost", @@ -418,22 +428,22 @@ class ChoiceWithAlias(click.Choice): default=None, help="[Experimental] Specify a custom chat template. " "Can be a file path or one-liner template string") -def serve(model: str, tokenizer: Optional[str], host: str, port: int, - log_level: str, backend: str, max_beam_width: int, - max_batch_size: int, max_num_tokens: int, max_seq_len: int, - tensor_parallel_size: int, pipeline_parallel_size: int, - context_parallel_size: int, moe_expert_parallel_size: Optional[int], - moe_cluster_parallel_size: Optional[int], - gpus_per_node: Optional[int], free_gpu_memory_fraction: float, - num_postprocess_workers: int, trust_remote_code: bool, - revision: Optional[str], extra_llm_api_options: Optional[str], - reasoning_parser: Optional[str], tool_parser: Optional[str], - metadata_server_config_file: Optional[str], - server_role: Optional[str], - fail_fast_on_attention_window_too_large: bool, - otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool, - disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str], - custom_module_dirs: list[Path], chat_template: Optional[str]): +def serve( + model: str, tokenizer: Optional[str], custom_tokenizer: Optional[str], + host: str, port: int, log_level: str, backend: str, max_beam_width: int, + max_batch_size: int, max_num_tokens: int, max_seq_len: int, + tensor_parallel_size: int, pipeline_parallel_size: int, + context_parallel_size: int, moe_expert_parallel_size: Optional[int], + moe_cluster_parallel_size: Optional[int], gpus_per_node: Optional[int], + free_gpu_memory_fraction: float, num_postprocess_workers: int, + trust_remote_code: bool, revision: Optional[str], + extra_llm_api_options: Optional[str], reasoning_parser: Optional[str], + tool_parser: Optional[str], metadata_server_config_file: Optional[str], + server_role: Optional[str], + fail_fast_on_attention_window_too_large: bool, + otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool, + disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str], + custom_module_dirs: list[Path], chat_template: Optional[str]): """Running an OpenAI API compatible server MODEL: model name | HF checkpoint path | TensorRT engine path @@ -450,6 +460,7 @@ def serve(model: str, tokenizer: Optional[str], host: str, port: int, llm_args, _ = get_llm_args( model=model, tokenizer=tokenizer, + custom_tokenizer=custom_tokenizer, backend=backend, max_beam_width=max_beam_width, max_batch_size=max_batch_size, diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index aa22dbc00a..a6dd960899 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -24,7 +24,8 @@ from tensorrt_llm.inputs.multimodal import (MultimodalServerConfig, from tensorrt_llm.inputs.registry import (MULTIMODAL_PLACEHOLDER_REGISTRY, MultimodalPlaceholderPlacement) from tensorrt_llm.llmapi.llm_utils import ModelLoader -from tensorrt_llm.llmapi.tokenizer import TokenizerBase, TransformersTokenizer +from tensorrt_llm.tokenizer import TokenizerBase, TransformersTokenizer +from tensorrt_llm.tokenizer.deepseek_v32 import DeepseekV32Tokenizer logger = logging.get_logger(__name__) @@ -580,6 +581,18 @@ def apply_chat_template( 
if model_type in HF_CHAT_TEMPLATE_EXCEPTIONS: # special path for models like llava-llama return "".join([conv["content"] for conv in conversation]) + + # Handle DeepSeek V32 tokenizer with custom chat template + if isinstance(tokenizer, DeepseekV32Tokenizer): + prompt = tokenizer.apply_chat_template( + messages=conversation, + tools=tools, + **(chat_template_kwargs or {}), + ) + if enable_tokenize: + return tokenizer.encode(prompt) + return prompt + if isinstance(tokenizer, TransformersTokenizer): tokenizer = tokenizer.tokenizer # we need the TokenizerBase for apply_chat_template diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 2f22f49340..d38dea63f7 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1838,6 +1838,14 @@ class BaseLlmArgs(StrictBaseModel): description="The mode to initialize the tokenizer.", json_schema_extra={"type": "Literal['auto', 'slow']"}) + custom_tokenizer: Optional[str] = Field( + default=None, + description="Specify a custom tokenizer implementation. Accepts either: " + "(1) a built-in alias (e.g., 'deepseek_v32'), or " + "(2) a Python import path (e.g., 'tensorrt_llm.tokenizer.deepseek_v32.DeepseekV32Tokenizer'). " + "The tokenizer class must implement 'from_pretrained(path, **kwargs)' and the TokenizerBase interface.", + status="prototype") + skip_tokenizer_init: bool = Field( default=False, description="Whether to skip the tokenizer initialization.") @@ -2187,6 +2195,41 @@ class BaseLlmArgs(StrictBaseModel): """Initialize tokenizer based on configuration.""" if self.skip_tokenizer_init: self.tokenizer = None + elif self.custom_tokenizer: + # If tokenizer is already a tokenizer object, custom_tokenizer is not compatible + if isinstance(self.tokenizer, + (TokenizerBase, PreTrainedTokenizerBase)): + raise ValueError( + "Cannot use custom_tokenizer when tokenizer is already a tokenizer object. " + "Please specify a tokenizer path or leave it as None to load from model path." + ) + + # Support short aliases for built-in tokenizers + TOKENIZER_ALIASES = { + 'deepseek_v32': + 'tensorrt_llm.tokenizer.deepseek_v32.DeepseekV32Tokenizer', + } + + tokenizer_path = TOKENIZER_ALIASES.get(self.custom_tokenizer, + self.custom_tokenizer) + + # Dynamically import and use custom tokenizer + from importlib import import_module + try: + module_path, class_name = tokenizer_path.rsplit('.', 1) + module = import_module(module_path) + tokenizer_class = getattr(module, class_name) + # Use tokenizer path if specified, otherwise use model path + load_path = self.tokenizer if self.tokenizer else self.model + self.tokenizer = tokenizer_class.from_pretrained( + load_path, + trust_remote_code=self.trust_remote_code, + use_fast=self.tokenizer_mode != 'slow') + except (ValueError, ImportError, AttributeError) as e: + raise ValueError( + f"Failed to load custom tokenizer '{self.custom_tokenizer}': {e}. " + "Expected format: 'module.path.ClassName' or a recognized alias." + ) from e else: self.tokenizer = tokenizer_factory( self.tokenizer, diff --git a/tensorrt_llm/llmapi/tokenizer.py b/tensorrt_llm/llmapi/tokenizer.py index 7e13643fb8..214ca86f97 100644 --- a/tensorrt_llm/llmapi/tokenizer.py +++ b/tensorrt_llm/llmapi/tokenizer.py @@ -1,365 +1,21 @@ -import os -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +# Backward compatibility shim - the tokenizer module has moved to tensorrt_llm.tokenizer +# All imports from tensorrt_llm.llmapi.tokenizer will continue to work. 
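A hedged sketch (not taken from the patch) of how the new custom_tokenizer knob is intended to be used through the LLM API: the alias 'deepseek_v32' is expanded by the TOKENIZER_ALIASES table above to tensorrt_llm.tokenizer.deepseek_v32.DeepseekV32Tokenizer and loaded via its from_pretrained(). The checkpoint path below is a placeholder.

from tensorrt_llm import LLM

# The alias resolves to DeepseekV32Tokenizer; a full import path such as
# "tensorrt_llm.tokenizer.deepseek_v32.DeepseekV32Tokenizer" should work as well.
llm = LLM(
    model="deepseek-ai/DeepSeek-V3.2",   # placeholder model path
    custom_tokenizer="deepseek_v32",
)

The trtllm-serve and trtllm-eval commands expose the same knob as the --custom_tokenizer option added earlier in this patch.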
+from tensorrt_llm.tokenizer import (TLLM_INCREMENTAL_DETOKENIZATION_BACKEND, + TLLM_STREAM_INTERVAL_THRESHOLD, + TokenizerBase, TransformersTokenizer, + _llguidance_tokenizer_info, + _xgrammar_tokenizer_info, load_hf_tokenizer, + tokenizer_factory) +from tensorrt_llm.tokenizer.deepseek_v32 import DeepseekV32Tokenizer -from transformers import (AutoTokenizer, PreTrainedTokenizerBase, - PreTrainedTokenizerFast) - -from .._utils import nvtx_range_debug -from ..logger import logger - -TLLM_INCREMENTAL_DETOKENIZATION_BACKEND = os.environ.get( - "TLLM_INCREMENTAL_DETOKENIZATION_BACKEND", "HF") -TLLM_STREAM_INTERVAL_THRESHOLD = int( - os.environ.get("TLLM_STREAM_INTERVAL_THRESHOLD", "24")) -try: - from tokenizers.decoders import DecodeStream # noqa -except ImportError: - logger.warning( - f"HF incremental detokenization is unsupported by tokenizer<0.21.0; fallback to TRTLLM incremental detokenization." - ) - TLLM_INCREMENTAL_DETOKENIZATION_BACKEND = "TRTLLM" - - -class TokenizerBase(PreTrainedTokenizerBase): - ''' This is a protocol for the tokenizer. Users can implement their own tokenizer by inheriting this class. ''' - - -class TransformersTokenizer(TokenizerBase): - ''' A wrapper for the Transformers' tokenizer. - This is the default tokenizer for LLM. ''' - - def __init__(self, tokenizer): - self.tokenizer = tokenizer - self._all_special_tokens_set = set(self.tokenizer.all_special_tokens) - - def __call__(self, text: str, *args, **kwargs) -> Any: - return self.tokenizer(text, *args, **kwargs) - - @property - def eos_token_id(self) -> int: - return self.tokenizer.eos_token_id - - @property - def pad_token_id(self) -> int: - return self.tokenizer.pad_token_id - - @property - def name_or_path(self) -> str: - return self.tokenizer.name_or_path - - def encode(self, text: str, *args, **kwargs) -> List[int]: - return self.tokenizer.encode(text, *args, **kwargs) - - def decode(self, token_ids: List[int], *args, **kwargs) -> str: - return self.tokenizer.decode(token_ids, *args, **kwargs) - - def batch_encode_plus(self, texts: List[str], *args, **kwargs) -> dict: - return self.tokenizer.batch_encode_plus(texts, *args, **kwargs) - - def get_chat_template(self, - chat_template: Optional[str] = None, - tools: Optional[List[Dict]] = None) -> str: - return self.tokenizer.get_chat_template(chat_template, tools) - - def apply_chat_template( - self, conversation: Union[List[Dict[str, str]], - List[List[Dict[str, str]]]], *args, - **kwargs) -> Union[str, List[int], List[str], List[List[int]]]: - return self.tokenizer.apply_chat_template(conversation, *args, **kwargs) - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self.tokenizer})" - - @classmethod - def from_pretrained(cls, pretrained_model_dir: str, **kwargs): - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, - **kwargs) - return cls(tokenizer) - - def save_pretrained(self, pretrained_model_dir: str, **kwargs): - self.tokenizer.save_pretrained(pretrained_model_dir, **kwargs) - - def clean_up_tokenization(self, out_string: str) -> str: - return self.tokenizer.clean_up_tokenization(out_string) - - @property - def clean_up_tokenization_spaces(self): - return self.tokenizer.clean_up_tokenization_spaces - - @property - def is_fast(self) -> bool: - return self.tokenizer.is_fast - - def get_added_vocab(self) -> Dict[str, int]: - # Assumed to be O(1) complexity - return self.tokenizer.get_added_vocab() - - def convert_ids_to_tokens( - self, - ids: Union[int, List[int]], - skip_special_tokens: bool = False) -> Union[str, 
List[str]]: - return self.tokenizer.convert_ids_to_tokens( - ids, skip_special_tokens=skip_special_tokens) - - def convert_tokens_to_string( - self, - tokens: List[str], - skip_special_tokens: bool = False, - spaces_between_special_tokens: bool = True) -> str: - # Adapted from - # https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/transformers_utils/detokenizer.py#L172 - if self.is_fast or not self.get_added_vocab(): - return self.tokenizer.convert_tokens_to_string(tokens) - - sub_texts: List[str] = [] - current_sub_text: List[str] = [] - for token in tokens: - if skip_special_tokens and token in self._all_special_tokens_set: - continue - if token in self.get_added_vocab(): - if current_sub_text: - sub_text = self.tokenizer.convert_tokens_to_string( - current_sub_text) - sub_texts.append(sub_text) - current_sub_text = [] - sub_texts.append(token) - else: - current_sub_text.append(token) - if current_sub_text: - sub_text = self.tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - if spaces_between_special_tokens: - return " ".join(sub_texts) - else: - return "".join(sub_texts) - - @nvtx_range_debug("decode_incrementally") - def decode_incrementally( - self, - token_ids: List[int], - prev_text: Optional[str] = None, - states: Optional[dict] = None, - *, - flush: bool = False, - skip_special_tokens: bool = False, - clean_up_tokenization_spaces: Optional[bool] = None, - spaces_between_special_tokens: bool = True, - stream_interval: int = 1) -> Tuple[str, dict]: - """Incremental detokenization, typically used for streaming generation. - - Args: - token_ids (List[int]): The incremental token ids. - prev_text (str): The previous decoded text. None if it's the first iteration. - states (dict): A dict that saves previous states for incremental detokenization. None if it's the first iteration. - flush (bool): Force flushing the pending tokens to decoded text. - skip_special_tokens (bool): Whether to remove special tokens in the decoding. - clean_up_tokenization_spaces (bool): Whether to clean up tokenization spaces. - spaces_between_special_tokens (bool): Whether to add spaces between special tokens. - stream_interval (int): The iteration interval to create responses under the streaming mode. - - Returns: - text, states (Tuple[str, dict]): text is the current decoded text, states is the current incremental detokenization states. - They should be passed to next incremental detokenization iteration, if any. - """ - # HF incremental detokenization implementation is faster than TRTLLM when stream_interval is smaller. 
- if (TLLM_INCREMENTAL_DETOKENIZATION_BACKEND == "TRTLLM" - or stream_interval >= TLLM_STREAM_INTERVAL_THRESHOLD - or spaces_between_special_tokens is False - or not hasattr(self.tokenizer, "_tokenizer")): - return self.trtllm_decode_incrementally( - token_ids, - prev_text, - states, - flush=flush, - skip_special_tokens=skip_special_tokens, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - spaces_between_special_tokens=spaces_between_special_tokens) - else: - return self.hf_decode_incrementally( - token_ids, - prev_text, - states, - skip_special_tokens=skip_special_tokens, - clean_up_tokenization_spaces=clean_up_tokenization_spaces) - - def trtllm_decode_incrementally( - self, - token_ids: List[int], - prev_text: Optional[str] = None, - states: Optional[dict] = None, - *, - flush: bool = False, - skip_special_tokens: bool = False, - clean_up_tokenization_spaces: Optional[bool] = None, - spaces_between_special_tokens: bool = True) -> Tuple[str, dict]: - # Adapted from - # https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/transformers_utils/detokenizer.py#L238 - if prev_text is None: - prev_text = "" - - if states is None: - states = {} - last_new_tokens = states.pop('last_new_tokens', []) - pending_tokens = states.pop('pending_tokens', []) - - if len(last_new_tokens) > 0: - last_new_text = self.convert_tokens_to_string( - last_new_tokens, - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens) - else: - last_new_text = "" - - new_tokens = self.convert_ids_to_tokens( - token_ids, skip_special_tokens=skip_special_tokens) - pending_tokens.extend(new_tokens) - - curr_new_text = self.convert_tokens_to_string( - last_new_tokens + pending_tokens, - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens) - if not flush and (len(curr_new_text.rstrip()) <= len( - last_new_text.rstrip()) or curr_new_text.endswith("�")): - return prev_text, { - 'last_new_tokens': last_new_tokens, - 'pending_tokens': pending_tokens - } - - # Remove the part of last_new_text - curr_new_text = curr_new_text[len(last_new_text):] - if clean_up_tokenization_spaces is None: - clean_up_tokenization_spaces = self.clean_up_tokenization_spaces - if clean_up_tokenization_spaces: - curr_new_text = self.clean_up_tokenization(curr_new_text) - return prev_text + curr_new_text, {'last_new_tokens': pending_tokens} - - def hf_decode_incrementally( - self, - token_ids: List[int], - prev_text: Optional[str] = None, - states: Optional[dict] = None, - *, - skip_special_tokens: bool = False, - clean_up_tokenization_spaces: Optional[bool] = None - ) -> Tuple[str, dict]: - if states is None: - states = { - 'decode_stream': - DecodeStream(skip_special_tokens=skip_special_tokens) - } - - decode_stream = states.get('decode_stream') - results = [ - result for tid in token_ids - if (result := decode_stream.step(self.tokenizer._tokenizer, tid) - ) is not None - ] - curr_new_text = "".join(results) - if clean_up_tokenization_spaces is None: - clean_up_tokenization_spaces = self.clean_up_tokenization_spaces - if clean_up_tokenization_spaces: - curr_new_text = self.clean_up_tokenization(curr_new_text) - - if prev_text is None: - return curr_new_text, states - else: - return prev_text + curr_new_text, states - - -def tokenizer_factory(obj: Optional[Union[str, Path, PreTrainedTokenizerBase, - TokenizerBase]] = None, - **kwargs) -> Optional[TokenizerBase]: - if obj is None: - return None - elif isinstance(obj, (str, Path)): - default_kwargs = 
{ - 'legacy': False, - 'padding_side': 'left', - 'truncation_side': 'left', - 'trust_remote_code': True, - 'use_fast': True, - } - default_kwargs.update(kwargs) - return TransformersTokenizer.from_pretrained(obj, **default_kwargs) - elif isinstance(obj, TokenizerBase): - return obj - elif isinstance(obj, PreTrainedTokenizerBase): - return TransformersTokenizer(obj) - else: - raise TypeError(f"Unrecognized tokenizer {obj}") - - -def _xgrammar_tokenizer_info(tokenizer): - # Reference: https://github.com/mlc-ai/xgrammar/blob/b9a16de54e1e0eff58da14c65750414cceaf1a6f/python/xgrammar/tokenizer_info.py#L133 - if isinstance(tokenizer, TokenizerBase): - tokenizer = tokenizer.tokenizer - - stop_token_ids = [tokenizer.eos_token_id] - - try: - encoded_vocab = tokenizer.get_vocab() - encoded_vocab = [ - token - for token, _ in sorted(encoded_vocab.items(), key=lambda x: x[1]) - ] - except AttributeError as e: - msg = ( - f"Cannot get the vocabulary of the tokenizer {type(tokenizer)}. The tokenizer " - "should have a get_vocab method.") - raise ValueError(msg) from e - - if isinstance(tokenizer, PreTrainedTokenizerFast): - backend_str = tokenizer.backend_tokenizer.to_str() - return { - "encoded_vocab": encoded_vocab, - "tokenizer_str": backend_str, - "stop_token_ids": stop_token_ids - } - elif ("vocab_file" in tokenizer.vocab_files_names - and "tiktoken" in tokenizer.vocab_files_names["vocab_file"]): - return { - "encoded_vocab": encoded_vocab, - "stop_token_ids": stop_token_ids - } - else: - raise ValueError(f"Unsupported tokenizer type: {type(tokenizer)}") - - -def _llguidance_tokenizer_info(tokenizer): - tokenizer_info = _xgrammar_tokenizer_info(tokenizer) - if tokenizer_info.get("tokenizer_str") is None: - raise ValueError("missing tokenizer_str") - return tokenizer_info - - -def load_hf_tokenizer(model_dir: str, - trust_remote_code: bool = True, - use_fast: bool = True, - **kwargs) -> Optional[TransformersTokenizer]: - ''' Load a tokenizer from a Hugging Face model directory. - - Args: - model_dir (str): The model directory. - trust_remote_code (bool): Whether to trust the remote code. - use_fast (bool): Whether to use the fast tokenizer. - - Returns: - A TransformersTokenizer object if the tokenizer is loaded successfully. 
- ''' - - try: - return TransformersTokenizer.from_pretrained( - model_dir, - legacy=False, - padding_side='left', - truncation_side='left', - trust_remote_code=trust_remote_code, - use_fast=use_fast, - **kwargs) - - except Exception as e: - logger.warning( - f"Failed to load hf tokenizer from {model_dir}, encounter error: {e}" - ) - return None +__all__ = [ + "TLLM_INCREMENTAL_DETOKENIZATION_BACKEND", + "TLLM_STREAM_INTERVAL_THRESHOLD", + "TokenizerBase", + "TransformersTokenizer", + "DeepseekV32Tokenizer", + "tokenizer_factory", + "_xgrammar_tokenizer_info", + "_llguidance_tokenizer_info", + "load_hf_tokenizer", +] diff --git a/tensorrt_llm/tokenizer/__init__.py b/tensorrt_llm/tokenizer/__init__.py new file mode 100644 index 0000000000..56beb13076 --- /dev/null +++ b/tensorrt_llm/tokenizer/__init__.py @@ -0,0 +1,21 @@ +from .tokenizer import ( + TLLM_INCREMENTAL_DETOKENIZATION_BACKEND, + TLLM_STREAM_INTERVAL_THRESHOLD, + TokenizerBase, + TransformersTokenizer, + _llguidance_tokenizer_info, + _xgrammar_tokenizer_info, + load_hf_tokenizer, + tokenizer_factory, +) + +__all__ = [ + "TLLM_INCREMENTAL_DETOKENIZATION_BACKEND", + "TLLM_STREAM_INTERVAL_THRESHOLD", + "TokenizerBase", + "TransformersTokenizer", + "tokenizer_factory", + "_xgrammar_tokenizer_info", + "_llguidance_tokenizer_info", + "load_hf_tokenizer", +] diff --git a/tensorrt_llm/tokenizer/deepseek_v32/__init__.py b/tensorrt_llm/tokenizer/deepseek_v32/__init__.py new file mode 100644 index 0000000000..9ca9ee21b9 --- /dev/null +++ b/tensorrt_llm/tokenizer/deepseek_v32/__init__.py @@ -0,0 +1,14 @@ +"""DeepSeek V3.2 tokenizer and encoding utilities. + +This is a temporary workaround for DeepSeek-V3.2 model as HF does not support it yet. +TODO: Remove this once HF supports DeepSeek-V3.2 +""" + +from .encoding import encode_messages, parse_message_from_completion_text +from .tokenizer import DeepseekV32Tokenizer + +__all__ = [ + "DeepseekV32Tokenizer", + "encode_messages", + "parse_message_from_completion_text", +] diff --git a/tensorrt_llm/tokenizer/deepseek_v32/encoding.py b/tensorrt_llm/tokenizer/deepseek_v32/encoding.py new file mode 100644 index 0000000000..24833b7b02 --- /dev/null +++ b/tensorrt_llm/tokenizer/deepseek_v32/encoding.py @@ -0,0 +1,425 @@ +# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py +# ruff: noqa: E501 +import copy +import json +import re +from typing import Any, Dict, List, Optional, Tuple, Union + +TOOLS_SYSTEM_TEMPLATE = """## Tools +You have access to a set of tools you can use to answer the user's question. +You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user: +<{dsml_token}function_calls> +<{dsml_token}invoke name="$FUNCTION_NAME"> +<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE +... + +<{dsml_token}invoke name="$FUNCTION_NAME2"> +... + + +String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects). +If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example: +<{dsml_token}function_calls> +... + + +... 
+ +{thinking_start_token}...thinking about results{thinking_end_token} +Here are the functions available in JSONSchema format: + +{tool_schemas} + +""" + +bos_token: str = "<|begin▁of▁sentence|>" +eos_token: str = "<|end▁of▁sentence|>" +thinking_start_token: str = "" +thinking_end_token: str = "" +dsml_token: str = "|DSML|" +system_msg_template: str = "{content}" +user_msg_template: str = "<|User|>{content}<|Assistant|>" +assistant_msg_template: str = "{reasoning}{content}{tool_calls}<|end▁of▁sentence|>" +thinking_template = "{reasoning_content}" + +response_format_template: str = ( + "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}" +) +tool_call_template: str = '<{dsml_token}invoke name="{name}">\n{arguments}\n' +tool_calls_template = "<{dsml_token}function_calls>\n{tool_calls}\n" + +tool_output_template: str = "\n{content}" + + +def to_json(value: Any) -> str: + try: + return json.dumps(value, ensure_ascii=False) + except Exception: + return json.dumps(value, ensure_ascii=True) + + +def tools_from_openai_format(tools): + return [tool["function"] for tool in tools] + + +def tool_calls_from_openai_format(tool_calls): + return [ + { + "name": tool_call["function"]["name"], + "arguments": tool_call["function"]["arguments"], + } + for tool_call in tool_calls + ] + + +def tool_calls_to_openai_format(tool_calls): + return [ + { + "type": "function", + "function": { + "name": tool_call["name"], + "arguments": tool_call["arguments"], + }, + } + for tool_call in tool_calls + ] + + +def encode_arguments_to_dsml(tool_call: Dict[str, str]) -> str: + p_dsml_template = ( + """<{dsml_token}parameter name="{key}" string="{is_str}">{value}""" + ) + P_dsml_strs = [] + + arguments = json.loads(tool_call["arguments"]) + + for k, v in arguments.items(): + p_dsml_str = p_dsml_template.format( + dsml_token=dsml_token, + key=k, + is_str="true" if isinstance(v, str) else "false", + value=v if isinstance(v, str) else to_json(v), + ) + + P_dsml_strs.append(p_dsml_str) + + return "\n".join(P_dsml_strs) + + +def decode_dsml_to_arguments( + tool_name: str, tool_args: Dict[str, Tuple[str, str]] +) -> Dict[str, str]: + def _decode_value(key: str, value: str, string: str): + if string == "true": + value = to_json(value) + return f"{to_json(key)}: {value}" + + tool_args_json = ( + "{" + + ", ".join([_decode_value(k, v, string=is_str) for k, (v, is_str) in tool_args.items()]) + + "}" + ) + return dict(name=tool_name, arguments=tool_args_json) + + +def render_tools(tools: List[Dict[str, Union[str, Dict[str, Any]]]]) -> str: + tools_json = [to_json(t) for t in tools] + + return TOOLS_SYSTEM_TEMPLATE.format( + tool_schemas="\n".join(tools_json), + dsml_token=dsml_token, + thinking_start_token=thinking_start_token, + thinking_end_token=thinking_end_token, + ) + + +def find_last_user_index(messages: List[Dict[str, Any]]) -> int: + last_user_index = -1 + for idx in range(len(messages) - 1, -1, -1): + if messages[idx].get("role") in ["user", "developer"]: + last_user_index = idx + break + return last_user_index + + +def render_message(index: int, messages: List[Dict[str, Any]], thinking_mode: str) -> str: + assert 0 <= index < len(messages) + assert thinking_mode in ["chat", "thinking"], f"Invalid thinking_mode `{thinking_mode}`" + + prompt = "" + msg = messages[index] + last_user_idx = find_last_user_index(messages) + + role = msg.get("role") + content = msg.get("content") + tools = msg.get("tools") + response_format = msg.get("response_format") + tool_calls = msg.get("tool_calls") + 
# support both reasoning_content and reasoning for compatibility + reasoning_content = msg.get("reasoning") or msg.get("reasoning_content") + + if tools: + tools = tools_from_openai_format(tools) + if tool_calls: + tool_calls = tool_calls_from_openai_format(tool_calls) + + if role == "system": + prompt += system_msg_template.format(content=content or "") + if tools: + prompt += "\n\n" + render_tools(tools) + + if response_format: + prompt += "\n\n" + response_format_template.format(schema=to_json(response_format)) + + elif role == "developer": + assert content, f"Invalid message for role `{role}`: {msg}" + content_developer = "" + if tools: + content_developer += "\n\n" + render_tools(tools) + + if response_format: + content_developer += "\n\n" + response_format_template.format( + schema=to_json(response_format) + ) + + content_developer += "\n\n# The user's message is: {}".format(content) + + prompt += user_msg_template.format(content=content_developer) + if index == last_user_idx and thinking_mode == "thinking": + prompt += thinking_start_token + else: + prompt += thinking_end_token + + elif role == "user": + prompt += user_msg_template.format(content=content) + + if index == last_user_idx and thinking_mode == "thinking": + prompt += thinking_start_token + else: + prompt += thinking_end_token + + elif role == "tool": + prev_assistant_idx = index - 1 + assistant_msg = messages[prev_assistant_idx] + while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool": + prev_assistant_idx -= 1 + assistant_msg = messages[prev_assistant_idx] + + assert index == 0 or prev_assistant_idx >= 0 and assistant_msg.get("role") == "assistant", ( + f"Invalid messages at {index}:\n{assistant_msg}" + ) + + tool_call_order = index - prev_assistant_idx + assistant_tool_calls = assistant_msg.get("tool_calls") + assert assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order, ( + "No tool calls but found tool output" + ) + + if tool_call_order == 1: + prompt += "\n\n" + + prompt += tool_output_template.format(content=content) + + if tool_call_order == len(assistant_tool_calls): + prompt += "\n" + + if index >= last_user_idx and thinking_mode == "thinking": + prompt += "\n\n" + thinking_start_token + else: + prompt += "\n\n" + thinking_end_token + + elif role == "assistant": + prev_assistant_idx = index + thinking_part = "" + + tool_calls_content = "" + if tool_calls: + tool_calls = [ + tool_call_template.format( + dsml_token=dsml_token, + name=tool_call.get("name"), + arguments=encode_arguments_to_dsml(tool_call), + ) + for tool_call in tool_calls + ] + tool_calls_content += "\n\n" + tool_calls_template.format( + dsml_token=dsml_token, tool_calls="\n".join(tool_calls) + ) + + summary_content = content or "" + + if thinking_mode == "thinking" and index > last_user_idx: + assert reasoning_content or tool_calls, ( + f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message" + ) + thinking_part = ( + thinking_template.format(reasoning_content=reasoning_content or "") + + thinking_end_token + ) + + prompt += assistant_msg_template.format( + reasoning=thinking_part, + content=summary_content, + tool_calls=tool_calls_content, + ) + else: + raise NotImplementedError(f"Unknown role: {role}") + + return prompt + + +def drop_thinking_messages( + messages: List[Dict[str, Any]], last_user_idx: Optional[int] = None +) -> List[Dict[str, Any]]: + messages_wo_thinking: List[Dict[str, Any]] = [] + last_user_idx = find_last_user_index(messages) if 
last_user_idx is None else last_user_idx + for idx, msg in enumerate(messages): + role = msg.get("role") + if role in ["user", "system", "tool"] or idx >= last_user_idx: + messages_wo_thinking.append(msg) + continue + + elif role == "assistant": + msg_wo_thinking = copy.copy(msg) + msg_wo_thinking.pop("reasoning_content", None) + msg_wo_thinking.pop("reasoning", None) + messages_wo_thinking.append(msg_wo_thinking) + + return messages_wo_thinking + + +def encode_messages( + messages: List[Dict[str, Any]], + thinking_mode: str, + context: Optional[List[Dict[str, Any]]] = None, + drop_thinking: bool = True, + add_default_bos_token: bool = True, +) -> str: + context = context if context else [] + full_messages = context + messages + + prompt = bos_token if add_default_bos_token and len(context) == 0 else "" + + if thinking_mode == "thinking" and drop_thinking: + full_messages = drop_thinking_messages(full_messages) + + for idx in range(len(messages)): + prompt += render_message(idx + len(context), full_messages, thinking_mode=thinking_mode) + + return prompt + + +def _read_until_stop(index: int, text: str, stop: List[str]) -> Tuple[int, str, Optional[str]]: + min_pos = len(text) + matched_stop = None + + for s in stop: + pos = text.find(s, index) + if pos != -1 and pos < min_pos: + min_pos = pos + matched_stop = s + + if matched_stop: + content = text[index:min_pos] + return min_pos + len(matched_stop), content, matched_stop + else: + content = text[index:] + return len(text), content, None + + +def parse_tool_calls(index: int, text: str): + tool_calls: List[Dict[str, Any]] = [] + stop_token = None + tool_calls_end_token = f"" + + while index < len(text): + index, _, stop_token = _read_until_stop( + index, text, [f"<{dsml_token}invoke", tool_calls_end_token] + ) + assert _ == ">\n", "Tool call format error" + + if stop_token == tool_calls_end_token: + break + + assert stop_token is not None, "Missing special token" + + index, tool_name_content, stop_token = _read_until_stop( + index, text, [f"<{dsml_token}parameter", f"\n$', tool_name_content, flags=re.DOTALL) + assert len(p_tool_name) == 1, "Tool name format error" + tool_name = p_tool_name[0] + + tool_args: Dict[str, Tuple[str, str]] = {} + while stop_token == f"<{dsml_token}parameter": + index, param_content, stop_token = _read_until_stop( + index, text, [f"/{dsml_token}parameter"] + ) + + param_kv = re.findall( + r'^ name="(.*?)" string="(true|false)">(.*?)<$', param_content, flags=re.DOTALL + ) + assert len(param_kv) == 1, "Parameter format error" + param_name, string, param_value = param_kv[0] + + assert param_name not in tool_args, "Duplicate parameter name" + tool_args[param_name] = (param_value, string) + + index, content, stop_token = _read_until_stop( + index, text, [f"<{dsml_token}parameter", f"\n", "Parameter format error" + + tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args) + tool_calls.append(tool_call) + + return index, stop_token, tool_calls + + +# NOTE: This function is designed to parse only correctly +# formatted string and will not attempt to correct malformed output +# that may be generated by the model. 
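A short usage sketch (illustrative only, not from the patch) of encode_messages(), the renderer behind DeepseekV32Tokenizer.apply_chat_template(): it flattens an OpenAI-style conversation, optionally with tools, into the single prompt string shown to the model.

from tensorrt_llm.tokenizer.deepseek_v32.encoding import encode_messages

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
# "chat" closes the thinking block immediately; "thinking" opens one after the last user turn.
prompt = encode_messages(conversation, thinking_mode="chat")
# The result starts with the BOS token, then the system text, then the
# <|User|>...<|Assistant|> turn markers for the user message.
print(prompt)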
+def parse_message_from_completion_text(text: str, thinking_mode: str): + summary_content, reasoning_content, tool_calls = "", "", [] + index, stop_token = 0, None + tool_calls_start_token = f"\n\n<{dsml_token}function_calls" + + is_thinking, is_tool_calling = thinking_mode == "thinking", False + + if is_thinking: + index, content_delta, stop_token = _read_until_stop( + index, text, [thinking_end_token, tool_calls_start_token] + ) + reasoning_content = content_delta + assert stop_token == thinking_end_token, "Invalid thinking format" + + index, content_delta, stop_token = _read_until_stop( + index, text, [eos_token, tool_calls_start_token] + ) + summary_content = content_delta + if stop_token == tool_calls_start_token: + is_tool_calling = True + else: + assert stop_token == eos_token, "Invalid summary format" + + if is_tool_calling: + index, stop_token, tool_calls = parse_tool_calls(index, text) + + index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token]) + assert not tool_ends_text, "Unexpected content after tool calls" + + assert len(text) == index and stop_token in [eos_token, None], "Unexpected content at end" + + for sp_token in [bos_token, eos_token, thinking_start_token, thinking_end_token, dsml_token]: + assert sp_token not in summary_content and sp_token not in reasoning_content, ( + "Unexpected special token in content" + ) + + return { + "role": "assistant", + "content": summary_content, + "reasoning_content": reasoning_content, + "reasoning": reasoning_content, + "tool_calls": tool_calls_to_openai_format(tool_calls), + } diff --git a/tensorrt_llm/tokenizer/deepseek_v32/tokenizer.py b/tensorrt_llm/tokenizer/deepseek_v32/tokenizer.py new file mode 100644 index 0000000000..4420955f45 --- /dev/null +++ b/tensorrt_llm/tokenizer/deepseek_v32/tokenizer.py @@ -0,0 +1,147 @@ +"""DeepSeek V3.2 tokenizer implementation. + +This is a temporary workaround for DeepSeek-V3.2 model as HF does not support it yet. 
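A hedged round-trip sketch (not part of the diff) for the completion parser above: given a well-formed completion that ends with the EOS token and contains no DSML function_calls block, it returns a plain assistant message with empty reasoning and no tool calls.

from tensorrt_llm.tokenizer.deepseek_v32.encoding import (
    eos_token, parse_message_from_completion_text)

completion = "The capital of France is Paris." + eos_token
message = parse_message_from_completion_text(completion, thinking_mode="chat")
assert message["role"] == "assistant"
assert message["content"] == "The capital of France is Paris."
assert message["tool_calls"] == []   # no function_calls block in this completion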
+TODO: Remove this once HF supports DeepSeek-V3.2 +""" + +from pathlib import Path +from typing import Any + +from transformers import AutoTokenizer + +from ..tokenizer import TransformersTokenizer +from .encoding import encode_messages + + +class DeepseekV32Tokenizer(TransformersTokenizer): + """DeepSeek V3.2 tokenizer with custom chat template.""" + + def __init__(self, tokenizer): + # tokenizer should be the HF tokenizer + self.tokenizer = tokenizer + self._all_special_tokens_set = set(self.tokenizer.all_special_tokens) + + @classmethod + def from_pretrained( + cls, + path_or_repo_id: str | Path, + *args, + trust_remote_code: bool = False, + revision: str | None = None, + download_dir: str | None = None, + **kwargs, + ) -> "DeepseekV32Tokenizer": + # Load HF tokenizer + hf_tokenizer = AutoTokenizer.from_pretrained( + path_or_repo_id, + *args, + trust_remote_code=trust_remote_code, + revision=revision, + **kwargs, + ) + return DeepseekV32Tokenizer(hf_tokenizer) + + def apply_chat_template(self, messages, tools=None, **kwargs): + thinking = kwargs.get("thinking", False) + thinking_mode = "thinking" if thinking else "chat" + messages = messages.copy() + drop_thinking = True + if tools is not None and len(tools) > 0: + messages.insert(0, {"role": "system"}) + messages[0]["tools"] = tools + drop_thinking = False + encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking) + prompt_str = encode_messages(messages, **encode_config) # type: ignore + return prompt_str + + @property + def all_special_tokens(self) -> list[str]: + return self.tokenizer.all_special_tokens + + @property + def all_special_ids(self) -> list[int]: + return self.tokenizer.all_special_ids + + @property + def bos_token_id(self) -> int: + return self.tokenizer.bos_token_id + + @property + def eos_token_id(self) -> int: + return self.tokenizer.eos_token_id + + @property + def pad_token_id(self) -> int: + return self.tokenizer.pad_token_id + + @property + def is_fast(self) -> bool: + return self.tokenizer.is_fast + + @property + def vocab_size(self) -> int: + return self.tokenizer.vocab_size + + @property + def max_token_id(self) -> int: + return self.tokenizer.max_token_id + + @property + def truncation_side(self) -> str: + return self.tokenizer.truncation_side + + def __hash__(self) -> int: + return hash(id(self)) + + def __len__(self) -> int: + # is an added token in DeepseekV32 tokenizer + return self.vocab_size + len(self.get_added_vocab()) + + def __call__( + self, + text: str | list[str], + text_pair: str | None = None, + add_special_tokens: bool = True, + truncation: bool = False, + max_length: int | None = None, + ) -> Any: + return self.tokenizer( + text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + truncation=truncation, + max_length=max_length, + ) + + def get_vocab(self) -> dict[str, int]: + return self.tokenizer.get_vocab() + + def get_added_vocab(self) -> dict[str, int]: + return self.tokenizer.get_added_vocab() + + def encode( + self, + text: str, + truncation: bool | None = None, + max_length: int | None = None, + add_special_tokens: bool = True, + ) -> list[int]: + return self.tokenizer.encode( + text, + truncation=truncation, + max_length=max_length, + add_special_tokens=add_special_tokens, + ) + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + return self.tokenizer.convert_tokens_to_string(tokens) + + def decode(self, ids: list[int] | int, skip_special_tokens: bool = False, **kwargs) -> str: + return self.tokenizer.decode(ids, 
skip_special_tokens=skip_special_tokens, **kwargs) + + def convert_ids_to_tokens( + self, + ids: list[int], + skip_special_tokens: bool = False, + ) -> list[str]: + return self.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens) diff --git a/tensorrt_llm/tokenizer/tokenizer.py b/tensorrt_llm/tokenizer/tokenizer.py new file mode 100644 index 0000000000..7e13643fb8 --- /dev/null +++ b/tensorrt_llm/tokenizer/tokenizer.py @@ -0,0 +1,365 @@ +import os +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from transformers import (AutoTokenizer, PreTrainedTokenizerBase, + PreTrainedTokenizerFast) + +from .._utils import nvtx_range_debug +from ..logger import logger + +TLLM_INCREMENTAL_DETOKENIZATION_BACKEND = os.environ.get( + "TLLM_INCREMENTAL_DETOKENIZATION_BACKEND", "HF") +TLLM_STREAM_INTERVAL_THRESHOLD = int( + os.environ.get("TLLM_STREAM_INTERVAL_THRESHOLD", "24")) +try: + from tokenizers.decoders import DecodeStream # noqa +except ImportError: + logger.warning( + f"HF incremental detokenization is unsupported by tokenizer<0.21.0; fallback to TRTLLM incremental detokenization." + ) + TLLM_INCREMENTAL_DETOKENIZATION_BACKEND = "TRTLLM" + + +class TokenizerBase(PreTrainedTokenizerBase): + ''' This is a protocol for the tokenizer. Users can implement their own tokenizer by inheriting this class. ''' + + +class TransformersTokenizer(TokenizerBase): + ''' A wrapper for the Transformers' tokenizer. + This is the default tokenizer for LLM. ''' + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + self._all_special_tokens_set = set(self.tokenizer.all_special_tokens) + + def __call__(self, text: str, *args, **kwargs) -> Any: + return self.tokenizer(text, *args, **kwargs) + + @property + def eos_token_id(self) -> int: + return self.tokenizer.eos_token_id + + @property + def pad_token_id(self) -> int: + return self.tokenizer.pad_token_id + + @property + def name_or_path(self) -> str: + return self.tokenizer.name_or_path + + def encode(self, text: str, *args, **kwargs) -> List[int]: + return self.tokenizer.encode(text, *args, **kwargs) + + def decode(self, token_ids: List[int], *args, **kwargs) -> str: + return self.tokenizer.decode(token_ids, *args, **kwargs) + + def batch_encode_plus(self, texts: List[str], *args, **kwargs) -> dict: + return self.tokenizer.batch_encode_plus(texts, *args, **kwargs) + + def get_chat_template(self, + chat_template: Optional[str] = None, + tools: Optional[List[Dict]] = None) -> str: + return self.tokenizer.get_chat_template(chat_template, tools) + + def apply_chat_template( + self, conversation: Union[List[Dict[str, str]], + List[List[Dict[str, str]]]], *args, + **kwargs) -> Union[str, List[int], List[str], List[List[int]]]: + return self.tokenizer.apply_chat_template(conversation, *args, **kwargs) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.tokenizer})" + + @classmethod + def from_pretrained(cls, pretrained_model_dir: str, **kwargs): + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, + **kwargs) + return cls(tokenizer) + + def save_pretrained(self, pretrained_model_dir: str, **kwargs): + self.tokenizer.save_pretrained(pretrained_model_dir, **kwargs) + + def clean_up_tokenization(self, out_string: str) -> str: + return self.tokenizer.clean_up_tokenization(out_string) + + @property + def clean_up_tokenization_spaces(self): + return self.tokenizer.clean_up_tokenization_spaces + + @property + def is_fast(self) -> bool: + return self.tokenizer.is_fast + + def 
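Putting the pieces above together, a hedged end-to-end sketch of the DeepseekV32Tokenizer class defined earlier in this patch; the checkpoint id is a placeholder for any directory holding the DeepSeek-V3.2 HF tokenizer files.

from tensorrt_llm.tokenizer.deepseek_v32 import DeepseekV32Tokenizer

tok = DeepseekV32Tokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-V3.2",          # placeholder checkpoint
    trust_remote_code=True,
)
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Summarize attention in one sentence."}],
    thinking=True,   # forwarded as thinking_mode="thinking" to encode_messages()
)
input_ids = tok.encode(prompt)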
get_added_vocab(self) -> Dict[str, int]: + # Assumed to be O(1) complexity + return self.tokenizer.get_added_vocab() + + def convert_ids_to_tokens( + self, + ids: Union[int, List[int]], + skip_special_tokens: bool = False) -> Union[str, List[str]]: + return self.tokenizer.convert_ids_to_tokens( + ids, skip_special_tokens=skip_special_tokens) + + def convert_tokens_to_string( + self, + tokens: List[str], + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = True) -> str: + # Adapted from + # https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/transformers_utils/detokenizer.py#L172 + if self.is_fast or not self.get_added_vocab(): + return self.tokenizer.convert_tokens_to_string(tokens) + + sub_texts: List[str] = [] + current_sub_text: List[str] = [] + for token in tokens: + if skip_special_tokens and token in self._all_special_tokens_set: + continue + if token in self.get_added_vocab(): + if current_sub_text: + sub_text = self.tokenizer.convert_tokens_to_string( + current_sub_text) + sub_texts.append(sub_text) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_text = self.tokenizer.convert_tokens_to_string(current_sub_text) + sub_texts.append(sub_text) + if spaces_between_special_tokens: + return " ".join(sub_texts) + else: + return "".join(sub_texts) + + @nvtx_range_debug("decode_incrementally") + def decode_incrementally( + self, + token_ids: List[int], + prev_text: Optional[str] = None, + states: Optional[dict] = None, + *, + flush: bool = False, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + spaces_between_special_tokens: bool = True, + stream_interval: int = 1) -> Tuple[str, dict]: + """Incremental detokenization, typically used for streaming generation. + + Args: + token_ids (List[int]): The incremental token ids. + prev_text (str): The previous decoded text. None if it's the first iteration. + states (dict): A dict that saves previous states for incremental detokenization. None if it's the first iteration. + flush (bool): Force flushing the pending tokens to decoded text. + skip_special_tokens (bool): Whether to remove special tokens in the decoding. + clean_up_tokenization_spaces (bool): Whether to clean up tokenization spaces. + spaces_between_special_tokens (bool): Whether to add spaces between special tokens. + stream_interval (int): The iteration interval to create responses under the streaming mode. + + Returns: + text, states (Tuple[str, dict]): text is the current decoded text, states is the current incremental detokenization states. + They should be passed to next incremental detokenization iteration, if any. + """ + # HF incremental detokenization implementation is faster than TRTLLM when stream_interval is smaller. 
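+        # Dispatch: use the TRTLLM path when the backend is forced via
+        # TLLM_INCREMENTAL_DETOKENIZATION_BACKEND, when stream_interval reaches
+        # TLLM_STREAM_INTERVAL_THRESHOLD, when spaces_between_special_tokens is
+        # disabled, or when the tokenizer has no fast backend (`_tokenizer`);
+        # otherwise use the HF DecodeStream path.
+        # Illustrative streaming usage (hypothetical loop, not executed here):
+        #     text, states = None, None
+        #     for chunk in streamed_token_id_chunks:  # each chunk is a List[int]
+        #         text, states = tokenizer.decode_incrementally(chunk, prev_text=text, states=states)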
+ if (TLLM_INCREMENTAL_DETOKENIZATION_BACKEND == "TRTLLM" + or stream_interval >= TLLM_STREAM_INTERVAL_THRESHOLD + or spaces_between_special_tokens is False + or not hasattr(self.tokenizer, "_tokenizer")): + return self.trtllm_decode_incrementally( + token_ids, + prev_text, + states, + flush=flush, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + spaces_between_special_tokens=spaces_between_special_tokens) + else: + return self.hf_decode_incrementally( + token_ids, + prev_text, + states, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces) + + def trtllm_decode_incrementally( + self, + token_ids: List[int], + prev_text: Optional[str] = None, + states: Optional[dict] = None, + *, + flush: bool = False, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + spaces_between_special_tokens: bool = True) -> Tuple[str, dict]: + # Adapted from + # https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/transformers_utils/detokenizer.py#L238 + if prev_text is None: + prev_text = "" + + if states is None: + states = {} + last_new_tokens = states.pop('last_new_tokens', []) + pending_tokens = states.pop('pending_tokens', []) + + if len(last_new_tokens) > 0: + last_new_text = self.convert_tokens_to_string( + last_new_tokens, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens) + else: + last_new_text = "" + + new_tokens = self.convert_ids_to_tokens( + token_ids, skip_special_tokens=skip_special_tokens) + pending_tokens.extend(new_tokens) + + curr_new_text = self.convert_tokens_to_string( + last_new_tokens + pending_tokens, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens) + if not flush and (len(curr_new_text.rstrip()) <= len( + last_new_text.rstrip()) or curr_new_text.endswith("�")): + return prev_text, { + 'last_new_tokens': last_new_tokens, + 'pending_tokens': pending_tokens + } + + # Remove the part of last_new_text + curr_new_text = curr_new_text[len(last_new_text):] + if clean_up_tokenization_spaces is None: + clean_up_tokenization_spaces = self.clean_up_tokenization_spaces + if clean_up_tokenization_spaces: + curr_new_text = self.clean_up_tokenization(curr_new_text) + return prev_text + curr_new_text, {'last_new_tokens': pending_tokens} + + def hf_decode_incrementally( + self, + token_ids: List[int], + prev_text: Optional[str] = None, + states: Optional[dict] = None, + *, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None + ) -> Tuple[str, dict]: + if states is None: + states = { + 'decode_stream': + DecodeStream(skip_special_tokens=skip_special_tokens) + } + + decode_stream = states.get('decode_stream') + results = [ + result for tid in token_ids + if (result := decode_stream.step(self.tokenizer._tokenizer, tid) + ) is not None + ] + curr_new_text = "".join(results) + if clean_up_tokenization_spaces is None: + clean_up_tokenization_spaces = self.clean_up_tokenization_spaces + if clean_up_tokenization_spaces: + curr_new_text = self.clean_up_tokenization(curr_new_text) + + if prev_text is None: + return curr_new_text, states + else: + return prev_text + curr_new_text, states + + +def tokenizer_factory(obj: Optional[Union[str, Path, PreTrainedTokenizerBase, + TokenizerBase]] = None, + **kwargs) -> Optional[TokenizerBase]: + if obj is None: + return None + elif isinstance(obj, (str, Path)): + default_kwargs = 
{ + 'legacy': False, + 'padding_side': 'left', + 'truncation_side': 'left', + 'trust_remote_code': True, + 'use_fast': True, + } + default_kwargs.update(kwargs) + return TransformersTokenizer.from_pretrained(obj, **default_kwargs) + elif isinstance(obj, TokenizerBase): + return obj + elif isinstance(obj, PreTrainedTokenizerBase): + return TransformersTokenizer(obj) + else: + raise TypeError(f"Unrecognized tokenizer {obj}") + + +def _xgrammar_tokenizer_info(tokenizer): + # Reference: https://github.com/mlc-ai/xgrammar/blob/b9a16de54e1e0eff58da14c65750414cceaf1a6f/python/xgrammar/tokenizer_info.py#L133 + if isinstance(tokenizer, TokenizerBase): + tokenizer = tokenizer.tokenizer + + stop_token_ids = [tokenizer.eos_token_id] + + try: + encoded_vocab = tokenizer.get_vocab() + encoded_vocab = [ + token + for token, _ in sorted(encoded_vocab.items(), key=lambda x: x[1]) + ] + except AttributeError as e: + msg = ( + f"Cannot get the vocabulary of the tokenizer {type(tokenizer)}. The tokenizer " + "should have a get_vocab method.") + raise ValueError(msg) from e + + if isinstance(tokenizer, PreTrainedTokenizerFast): + backend_str = tokenizer.backend_tokenizer.to_str() + return { + "encoded_vocab": encoded_vocab, + "tokenizer_str": backend_str, + "stop_token_ids": stop_token_ids + } + elif ("vocab_file" in tokenizer.vocab_files_names + and "tiktoken" in tokenizer.vocab_files_names["vocab_file"]): + return { + "encoded_vocab": encoded_vocab, + "stop_token_ids": stop_token_ids + } + else: + raise ValueError(f"Unsupported tokenizer type: {type(tokenizer)}") + + +def _llguidance_tokenizer_info(tokenizer): + tokenizer_info = _xgrammar_tokenizer_info(tokenizer) + if tokenizer_info.get("tokenizer_str") is None: + raise ValueError("missing tokenizer_str") + return tokenizer_info + + +def load_hf_tokenizer(model_dir: str, + trust_remote_code: bool = True, + use_fast: bool = True, + **kwargs) -> Optional[TransformersTokenizer]: + ''' Load a tokenizer from a Hugging Face model directory. + + Args: + model_dir (str): The model directory. + trust_remote_code (bool): Whether to trust the remote code. + use_fast (bool): Whether to use the fast tokenizer. + + Returns: + A TransformersTokenizer object if the tokenizer is loaded successfully. 
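+        ``None`` if loading fails (a warning is logged).
+
+    Example (illustrative; the model path below is a placeholder):
+        tokenizer = load_hf_tokenizer("/path/to/model_dir")
+        ids = tokenizer.encode("Hello") if tokenizer is not None else []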
+ ''' + + try: + return TransformersTokenizer.from_pretrained( + model_dir, + legacy=False, + padding_side='left', + truncation_side='left', + trust_remote_code=trust_remote_code, + use_fast=use_fast, + **kwargs) + + except Exception as e: + logger.warning( + f"Failed to load hf tokenizer from {model_dir}, encounter error: {e}" + ) + return None diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index 6f2066ee59..4b6f8cedab 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -60,6 +60,10 @@ methods: annotation: Optional[str] default: null status: prototype + custom_tokenizer: + annotation: Optional[str] + default: null + status: prototype # reasoning reasoning_parser: annotation: Optional[str] From ac03915dc3823a4e9a23d4e7a6fe04f7f29f585e Mon Sep 17 00:00:00 2001 From: Pengyun Lin <81065165+LinPoly@users.noreply.github.com> Date: Fri, 19 Dec 2025 17:20:03 +0800 Subject: [PATCH 13/25] [TRTLLM-9604][feat] DS R1 & V3.1 tool parser (#10010) Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com> --- .../tool_chat_template_deepseekr1.jinja | 92 ++++++++ .../tool_chat_template_deepseekv31.jinja | 91 ++++++++ .../serve/tool_parser/deepseekv31_parser.py | 203 +++++++++++++++++ .../serve/tool_parser/deepseekv3_parser.py | 207 ++++++++++++++++++ .../serve/tool_parser/tool_parser_factory.py | 4 + .../unittest/llmapi/apps/test_tool_parsers.py | 204 ++++++++++++++--- 6 files changed, 767 insertions(+), 34 deletions(-) create mode 100644 examples/serve/chat_templates/tool_chat_template_deepseekr1.jinja create mode 100644 examples/serve/chat_templates/tool_chat_template_deepseekv31.jinja create mode 100644 tensorrt_llm/serve/tool_parser/deepseekv31_parser.py create mode 100644 tensorrt_llm/serve/tool_parser/deepseekv3_parser.py diff --git a/examples/serve/chat_templates/tool_chat_template_deepseekr1.jinja b/examples/serve/chat_templates/tool_chat_template_deepseekr1.jinja new file mode 100644 index 0000000000..45377c4729 --- /dev/null +++ b/examples/serve/chat_templates/tool_chat_template_deepseekr1.jinja @@ -0,0 +1,92 @@ +{% if not add_generation_prompt is defined %} + {% set add_generation_prompt = false %} +{% endif %} +{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %} +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {%- if ns.is_first_sp %} + {% set ns.system_prompt = ns.system_prompt + message['content'] %} + {% set ns.is_first_sp = false %} + {%- else %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %} + {%- endif %} + {%- endif %} +{%- endfor -%} + +{#- Adapted from https://github.com/sgl-project/sglang/blob/main/examples/chat_template/tool_chat_template_deepseekr1.jinja #} +{% if tools is defined and tools is not none %} + {% set tool_ns = namespace(text='You are a helpful assistant with tool calling capabilities. ' + 'When a tool call is needed, you MUST use the following format to issue the call:\n' + '<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>FUNCTION_NAME\n' + '```json\n{"param1": "value1", "param2": "value2"}\n```<|tool▁call▁end|><|tool▁calls▁end|>\n\n' + 'Make sure the JSON is valid.' 
+ '## Tools\n\n### Function\n\nYou have the following functions available:\n\n') %} + {% for tool in tools %} + {% set tool_ns.text = tool_ns.text + '\n```json\n' + (tool | tojson) + '\n```\n' %} + {% endfor %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %} +{% endif %} + +{{- bos_token }} +{{- ns.system_prompt }} +{%- for message in messages %} + {% set content = message['content'] %} + {%- if message['role'] == 'user' %} + {%- set ns.is_tool = false -%} + {%- set ns.is_first = false -%} + {%- set ns.is_last_user = true -%} + {{'<|User|>' + content + '<|Assistant|>'}} + {%- endif %} + {%- if message['role'] == 'assistant' %} + {% if '' in content %} + {% set content = content.split('')[-1] %} + {% endif %} + {% endif %} + {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %} + {%- set ns.is_last_user = false -%} + {%- if ns.is_tool %} + {{- '<|tool▁outputs▁end|>'}} + {%- endif %} + {%- set ns.is_first = false %} + {%- set ns.is_tool = false -%} + {%- set ns.is_output_first = true %} + {%- for tool in message['tool_calls'] %} + {%- if not ns.is_first %} + {%- if content is none %} + {{- '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- else %} + {{- content + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- endif %} + {%- set ns.is_first = true -%} + {%- else %} + {{- '\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- endif %} + {%- endfor %} + {{- '<|tool▁calls▁end|><|end▁of▁sentence|>'}} + {%- endif %} + {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%} + {%- set ns.is_last_user = false -%} + {%- if ns.is_tool %} + {{- '<|tool▁outputs▁end|>' + content + '<|end▁of▁sentence|>'}} + {%- set ns.is_tool = false -%} + {%- else %} + {{- content + '<|end▁of▁sentence|>'}} + {%- endif %} + {%- endif %} + {%- if message['role'] == 'tool' %} + {%- set ns.is_last_user = false -%} + {%- set ns.is_tool = true -%} + {%- if ns.is_output_first %} + {{- '<|tool▁outputs▁begin|><|tool▁output▁begin|>' + content + '<|tool▁output▁end|>'}} + {%- set ns.is_output_first = false %} + {%- else %} + {{- '\n<|tool▁output▁begin|>' + content + '<|tool▁output▁end|>'}} + {%- endif %} + {%- endif %} +{%- endfor -%} +{% if ns.is_tool %} + {{- '<|tool▁outputs▁end|>'}} +{%- endif %} +{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %} + {{- '<|Assistant|>'}} +{%- endif %} diff --git a/examples/serve/chat_templates/tool_chat_template_deepseekv31.jinja b/examples/serve/chat_templates/tool_chat_template_deepseekv31.jinja new file mode 100644 index 0000000000..08e93a30af --- /dev/null +++ b/examples/serve/chat_templates/tool_chat_template_deepseekv31.jinja @@ -0,0 +1,91 @@ +{% if not add_generation_prompt is defined %} + {% set add_generation_prompt = false %} +{% endif %} +{% if not thinking is defined %} + {% set thinking = false %} +{% endif %} +{% set ns = namespace(is_first=false, is_tool=false, system_prompt='', is_first_sp=true, is_last_user=false) %} +{%- for message in messages %} + {%- if 
message['role'] == 'system' %} + {%- if ns.is_first_sp %} + {% set ns.system_prompt = ns.system_prompt + message['content'] %} + {% set ns.is_first_sp = false %} + {%- else %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %} + {%- endif %} + {%- endif %} +{%- endfor %} + +{% if tools is defined and tools is not none %} + {% set tool_ns = namespace(text='## Tools\nYou have access to the following tools:\n') %} + {% for tool in tools %} + {% set tool_ns.text = tool_ns.text + '\n### ' + tool.function.name + '\nDescription: ' + tool.function.description + '\n\nParameters: ' + (tool.function.parameters | tojson) + '\n' %} + {% endfor %} + {% set tool_ns.text = tool_ns.text + "\nIMPORTANT: ALWAYS adhere to this exact format for tool use:\n<|tool▁calls▁begin|><|tool▁call▁begin|>tool_call_name<|tool▁sep|>tool_call_arguments<|tool▁call▁end|>{{additional_tool_calls}}<|tool▁calls▁end|>\n\nWhere:\n\n- `tool_call_name` must be an exact match to one of the available tools\n- `tool_call_arguments` must be valid JSON that strictly follows the tool's Parameters Schema\n- For multiple tool calls, chain them directly without separators or spaces\n" %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %} +{% endif %} + +{{ bos_token }}{{ ns.system_prompt }} +{%- for message in messages %} + {%- if message['role'] == 'user' %} + {%- set ns.is_tool = false -%} + {%- set ns.is_first = false -%} + {%- set ns.is_last_user = true -%} + {{'<|User|>' + message['content']}} + {%- endif %} + {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %} + {%- if ns.is_last_user %} + {{'<|Assistant|>'}} + {%- endif %} + {%- set ns.is_last_user = false -%} + {%- set ns.is_first = false %} + {%- set ns.is_tool = false -%} + {%- for tool in message['tool_calls'] %} + {%- if not ns.is_first %} + {%- if message['content'] is none %} + {{'<|tool▁calls▁begin|><|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments']|tojson + '<|tool▁call▁end|>'}} + {%- else %} + {{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments']|tojson + '<|tool▁call▁end|>'}} + {%- endif %} + {%- set ns.is_first = true -%} + {%- else %} + {{'<|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments']|tojson + '<|tool▁call▁end|>'}} + {%- endif %} + {%- endfor %} + {{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} + {%- endif %} + {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %} + {%- if ns.is_last_user %} + {{'<|Assistant|>'}} + {%- if message['prefix'] is defined and message['prefix'] and thinking %} + {{''}} + {%- else %} + {{''}} + {%- endif %} + {%- endif %} + {%- set ns.is_last_user = false -%} + {%- if ns.is_tool %} + {{message['content'] + '<|end▁of▁sentence|>'}} + {%- set ns.is_tool = false -%} + {%- else %} + {%- set content = message['content'] -%} + {%- if '' in content %} + {%- set content = content.split('', 1)[1] -%} + {%- endif %} + {{content + '<|end▁of▁sentence|>'}} + {%- endif %} + {%- endif %} + {%- if message['role'] == 'tool' %} + {%- set ns.is_last_user = false -%} + {%- set ns.is_tool = true -%} + {{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}} + {%- endif %} +{%- endfor -%} +{%- if add_generation_prompt and ns.is_last_user and not ns.is_tool %} + {{'<|Assistant|>'}} + {%- if not thinking %} + 
{{''}} + {%- else %} + {{''}} + {%- endif %} +{% endif %} diff --git a/tensorrt_llm/serve/tool_parser/deepseekv31_parser.py b/tensorrt_llm/serve/tool_parser/deepseekv31_parser.py new file mode 100644 index 0000000000..a0b306a106 --- /dev/null +++ b/tensorrt_llm/serve/tool_parser/deepseekv31_parser.py @@ -0,0 +1,203 @@ +# Adapted from https://github.com/sgl-project/sglang/blob/94e1251131ca27260cb0e8938aeb7b4a4e630b19/python/sglang/srt/function_call/deepseekv31_detector.py +import json +import re +from typing import List + +from tensorrt_llm.logger import logger +from tensorrt_llm.serve.openai_protocol import ChatCompletionToolsParam as Tool +from tensorrt_llm.serve.tool_parser.base_tool_parser import BaseToolParser +from tensorrt_llm.serve.tool_parser.core_types import ( + StreamingParseResult, + StructureInfo, + ToolCallItem, + _GetInfoFunc, +) + +from .utils import is_complete_json + + +class DeepSeekV31Parser(BaseToolParser): + ( + """Tool parser for DeepSeek V3 model function call format. + + The DeepSeek V3 format uses special Unicode tokens to delimit function calls + with JSON code blocks for arguments. + + Format Structure: + ``` + <|tool▁calls▁begin|><|tool▁call▁begin|>{function_name}<|tool▁sep|>{json_arguments}<|tool▁calls▁end|><|end▁of▁sentence|> + ``` + Examples: + ``` + """ + """<|tool▁calls▁begin|>""" + """<|tool▁call▁begin|>get_current_weather<|tool▁sep|>{"location": "Tokyo"}<|tool▁call▁end|>""" + """<|tool▁call▁begin|>get_current_weather<|tool▁sep|>{"location": "Paris"}<|tool▁call▁end|>""" + """<|tool▁calls▁end|><|end▁of▁sentence|> + ``` + + Key Components: + - Tool Calls Section: Wrapped between `<|tool▁calls▁begin|>` and `<|tool▁calls▁end|>` + - Individual Tool Call: Wrapped between `<|tool▁call▁begin|>` and `<|tool▁call▁end|>` + - Function Declaration: `<|tool▁call▁begin|>{function_name}<|tool▁sep|>` + - Arguments: JSON code block between `<|tool▁sep|>` and `<|tool▁call▁end|>` + - Supports multiple tool calls + + Reference: https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V3.1 + """ + ) + + def __init__(self): + super().__init__() + self.bot_token = "<|tool▁calls▁begin|>" # nosec B105 + self.eot_token = "<|tool▁calls▁end|>" # nosec B105 + self.func_call_regex = r"<|tool▁call▁begin|>.*?<|tool▁call▁end|>" + self.func_detail_regex = r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*)<|tool▁call▁end|>" + self._last_arguments = "" + self.current_tool_id = -1 + + def has_tool_call(self, text: str) -> bool: + """Check if the text contains a deepseek format tool call.""" + return self.bot_token in text + + def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult: + """One-time parsing: Detects and parses tool calls in the provided text. + + :param text: The complete text to parse. + :param tools: List of available tools. + :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls. 
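+
+        Example (illustrative, matching the class docstring and unit tests):
+            'Hi <|tool▁calls▁begin|><|tool▁call▁begin|>get_weather<|tool▁sep|>{"location": "Tokyo"}<|tool▁call▁end|><|tool▁calls▁end|>'
+            parses to normal_text "Hi" and one get_weather call with arguments {"location": "Tokyo"}.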
+ """ + idx = text.find(self.bot_token) + normal_text = text[:idx].strip() if idx != -1 else text + if self.bot_token not in text: + return StreamingParseResult(normal_text=normal_text, calls=[]) + match_result_list = re.findall(self.func_call_regex, text, re.DOTALL) + calls = [] + try: + for match_result in match_result_list: + # Get function name + func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL) + func_name = func_detail.group(1) + func_args = func_detail.group(2) + func_args = json.loads(func_args) + # construct match_result for parse_base_json + match_result = {"name": func_name, "parameters": func_args} + calls.extend(self.parse_base_json(match_result, tools)) + return StreamingParseResult(normal_text=normal_text, calls=calls) + except Exception as e: + logger.error(f"Error in detect_and_parse: {e}") + # return the normal text if parsing fails + return StreamingParseResult(normal_text=text) + + def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> StreamingParseResult: + """Streaming incremental parsing tool calls for DeepSeekV3 format.""" + self._buffer += new_text + current_text = self._buffer + + # Check if we have a tool call (either the start token or individual tool call) + has_tool_call = self.bot_token in current_text or "<|tool▁call▁begin|>" in current_text + + if not has_tool_call: + if any( + e_token.startswith(new_text) + for e_token in [self.bot_token, "<|tool▁call▁begin|>"] + ): + return StreamingParseResult() + self._buffer = "" + for e_token in [self.eot_token, "<|tool▁call▁end|>"]: + if e_token in new_text: + new_text = new_text.replace(e_token, "") + return StreamingParseResult(normal_text=new_text) + + if not hasattr(self, "_tool_indices"): + self._tool_indices = self._get_tool_indices(tools) + + calls: list[ToolCallItem] = [] + try: + partial_match = re.search( + pattern=r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*?)(<|tool▁call▁end|>|$)", + string=current_text, + flags=re.DOTALL, + ) + if partial_match: + func_name = partial_match.group(1).strip() + func_args_raw = partial_match.group(2).strip() + is_tool_end = partial_match.group(3) + + # Initialize state if this is the first tool call + if self.current_tool_id == -1: + self.current_tool_id = 0 + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [""] + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + while len(self.streamed_args_for_tool) <= self.current_tool_id: + self.streamed_args_for_tool.append("") + + if not self.current_tool_name_sent: + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=func_name, + parameters="", + ) + ) + self.current_tool_name_sent = True + # Store the tool call info for serving layer completions endpoint + self.prev_tool_call_arr[self.current_tool_id] = { + "name": func_name, + "arguments": {}, + } + else: + argument_diff = ( + func_args_raw[len(self._last_arguments) :] + if func_args_raw.startswith(self._last_arguments) + else func_args_raw + ) + + if argument_diff: + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=None, + parameters=argument_diff, + ) + ) + self._last_arguments += argument_diff + self.streamed_args_for_tool[self.current_tool_id] += argument_diff + + if is_complete_json(func_args_raw): + # Update the stored arguments + try: + parsed_args = json.loads(func_args_raw) + self.prev_tool_call_arr[self.current_tool_id]["arguments"] = parsed_args + except 
json.JSONDecodeError: + pass + + # Find the end of the current tool call and remove only that part from buffer + if is_tool_end: + # Remove the completed tool call from buffer, keep any remaining content + self._buffer = current_text[partial_match.end(3) :] + else: + self._buffer = "" + + result = StreamingParseResult(normal_text="", calls=calls) + self.current_tool_id += 1 + self._last_arguments = "" + self.current_tool_name_sent = False + return result + + return StreamingParseResult(normal_text="", calls=calls) + + except Exception as e: + logger.error(f"Error in parse_streaming_increment: {e}") + return StreamingParseResult(normal_text=current_text) + + def structure_info(self) -> _GetInfoFunc: + return lambda name: StructureInfo( + begin="<|tool▁call▁begin|>" + name + "<|tool▁sep|>", + end="<|tool▁call▁end|>", + trigger="<|tool▁call▁begin|>" + name + "<|tool▁sep|>", + ) diff --git a/tensorrt_llm/serve/tool_parser/deepseekv3_parser.py b/tensorrt_llm/serve/tool_parser/deepseekv3_parser.py new file mode 100644 index 0000000000..8eb49eb81c --- /dev/null +++ b/tensorrt_llm/serve/tool_parser/deepseekv3_parser.py @@ -0,0 +1,207 @@ +# Adapted from https://github.com/sgl-project/sglang/blob/94e1251131ca27260cb0e8938aeb7b4a4e630b19/python/sglang/srt/function_call/deepseekv3_detector.py +import json +import re +from typing import List + +from tensorrt_llm.logger import logger +from tensorrt_llm.serve.openai_protocol import ChatCompletionToolsParam as Tool +from tensorrt_llm.serve.tool_parser.base_tool_parser import BaseToolParser +from tensorrt_llm.serve.tool_parser.core_types import ( + StreamingParseResult, + StructureInfo, + ToolCallItem, + _GetInfoFunc, +) + +from .utils import is_complete_json + + +class DeepSeekV3Parser(BaseToolParser): + ( + r"""Tool parser for DeepSeek V3 model function call format. + + The DeepSeek V3 format uses special Unicode tokens to delimit function calls + with JSON code blocks for arguments. 
+ + Format Structure: + ``` + <|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>{function_name}\n```json\n{json_arguments}\n```<|tool▁calls▁end|><|end▁of▁sentence|> + ``` + Examples: + ``` + """ + r"""<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n""" + r"""```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|>\n""" + r"""<|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n""" + r"""```json\n{"location": "Paris"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>""" + r""" + ``` + + Key Components: + - Tool Calls Section: Wrapped between `<|tool▁calls▁begin|>` and `<|tool▁calls▁end|>` + - Individual Tool Call: Wrapped between `<|tool▁call▁begin|>` and `<|tool▁call▁end|>` + - Function Declaration: `function<|tool▁sep|>{function_name}` + - Arguments: JSON code block between ````json` and ```` + - Supports multiple tool calls + + Reference: https://huggingface.co/deepseek-ai/DeepSeek-V3-0324?chat_template=default + """ + ) + + def __init__(self): + super().__init__() + self.bot_token = "<|tool▁calls▁begin|>" # nosec B105 + self.eot_token = "<|tool▁calls▁end|>" # nosec B105 + self.func_call_regex = r"<|tool▁call▁begin|>.*?<|tool▁call▁end|>" + self.func_detail_regex = ( + r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*)\n```json\n(.*)\n```<|tool▁call▁end|>" + ) + self._last_arguments = "" + self.current_tool_id = -1 + + def has_tool_call(self, text: str) -> bool: + """Check if the text contains a deepseek format tool call.""" + return self.bot_token in text + + def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult: + """One-time parsing: Detects and parses tool calls in the provided text. + + :param text: The complete text to parse. + :param tools: List of available tools. + :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls. 
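+
+        Example (illustrative, matching the class docstring and unit tests):
+            'Lead <|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather\n```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|><|tool▁calls▁end|>'
+            parses to normal_text "Lead" and one get_weather call with arguments {"location": "Tokyo"}.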
+ """ + idx = text.find(self.bot_token) + normal_text = text[:idx].strip() if idx != -1 else text + if self.bot_token not in text: + return StreamingParseResult(normal_text=normal_text, calls=[]) + match_result_list = re.findall(self.func_call_regex, text, re.DOTALL) + calls = [] + try: + for match_result in match_result_list: + # Get function name + func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL) + func_name = func_detail.group(2) + func_args = func_detail.group(3) + func_args = json.loads(func_args) + # construct match_result for parse_base_json + match_result = {"name": func_name, "parameters": func_args} + calls.extend(self.parse_base_json(match_result, tools)) + return StreamingParseResult(normal_text=normal_text, calls=calls) + except Exception as e: + logger.error(f"Error in detect_and_parse: {e}") + # return the normal text if parsing fails + return StreamingParseResult(normal_text=text) + + def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> StreamingParseResult: + """Streaming incremental parsing tool calls for DeepSeekV3 format.""" + self._buffer += new_text + current_text = self._buffer + + # Check if we have a tool call (either the start token or individual tool call) + has_tool_call = self.bot_token in current_text or "<|tool▁call▁begin|>" in current_text + + if not has_tool_call: + if any( + e_token.startswith(new_text) + for e_token in [self.bot_token, "<|tool▁call▁begin|>"] + ): + return StreamingParseResult() + self._buffer = "" + for e_token in [self.eot_token, "```", "<|tool▁call▁end|>"]: + if e_token in new_text: + new_text = new_text.replace(e_token, "") + return StreamingParseResult(normal_text=new_text) + + if not hasattr(self, "_tool_indices"): + self._tool_indices = self._get_tool_indices(tools) + + calls: list[ToolCallItem] = [] + try: + partial_match = re.search( + pattern=r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*)\n```json\n(.*)\n```.*", + string=current_text, + flags=re.DOTALL, + ) + if partial_match: + func_name = partial_match.group(2).strip() + func_args_raw = partial_match.group(3).strip() + + # Initialize state if this is the first tool call + if self.current_tool_id == -1: + self.current_tool_id = 0 + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [""] + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + while len(self.streamed_args_for_tool) <= self.current_tool_id: + self.streamed_args_for_tool.append("") + + if not self.current_tool_name_sent: + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=func_name, + parameters="", + ) + ) + self.current_tool_name_sent = True + # Store the tool call info for serving layer completions endpoint + self.prev_tool_call_arr[self.current_tool_id] = { + "name": func_name, + "arguments": {}, + } + else: + argument_diff = ( + func_args_raw[len(self._last_arguments) :] + if func_args_raw.startswith(self._last_arguments) + else func_args_raw + ) + + if argument_diff: + calls.append( + ToolCallItem( + tool_index=self.current_tool_id, + name=None, + parameters=argument_diff, + ) + ) + self._last_arguments += argument_diff + self.streamed_args_for_tool[self.current_tool_id] += argument_diff + + if is_complete_json(func_args_raw): + # Update the stored arguments + try: + parsed_args = json.loads(func_args_raw) + self.prev_tool_call_arr[self.current_tool_id]["arguments"] = parsed_args + except json.JSONDecodeError: + pass + + # Find the end of 
the current tool call and remove only that part from buffer + tool_call_end_pattern = r"<|tool▁call▁begin|>.*?<|tool▁call▁end|>" + match = re.search(tool_call_end_pattern, current_text, re.DOTALL) + if match: + # Remove the completed tool call from buffer, keep any remaining content + self._buffer = current_text[match.end() :] + else: + self._buffer = "" + + result = StreamingParseResult(normal_text="", calls=calls) + self.current_tool_id += 1 + self._last_arguments = "" + self.current_tool_name_sent = False + return result + + return StreamingParseResult(normal_text="", calls=calls) + + except Exception as e: + logger.error(f"Error in parse_streaming_increment: {e}") + return StreamingParseResult(normal_text=current_text) + + def structure_info(self) -> _GetInfoFunc: + return lambda name: StructureInfo( + begin=">" + name + "\n```json\n", + end="\n```<", + trigger=">" + name + "\n```json\n", + ) diff --git a/tensorrt_llm/serve/tool_parser/tool_parser_factory.py b/tensorrt_llm/serve/tool_parser/tool_parser_factory.py index 3cf37c01ff..50bd44b994 100644 --- a/tensorrt_llm/serve/tool_parser/tool_parser_factory.py +++ b/tensorrt_llm/serve/tool_parser/tool_parser_factory.py @@ -1,6 +1,8 @@ from typing import Type from .base_tool_parser import BaseToolParser +from .deepseekv3_parser import DeepSeekV3Parser +from .deepseekv31_parser import DeepSeekV31Parser from .kimi_k2_tool_parser import KimiK2ToolParser from .qwen3_coder_parser import Qwen3CoderToolParser from .qwen3_tool_parser import Qwen3ToolParser @@ -11,6 +13,8 @@ class ToolParserFactory: "qwen3": Qwen3ToolParser, "qwen3_coder": Qwen3CoderToolParser, "kimi_k2": KimiK2ToolParser, + "deepseekv3": DeepSeekV3Parser, + "deepseekv3.1": DeepSeekV31Parser, } @staticmethod diff --git a/tests/unittest/llmapi/apps/test_tool_parsers.py b/tests/unittest/llmapi/apps/test_tool_parsers.py index 657257e0ca..5c0b03ed08 100644 --- a/tests/unittest/llmapi/apps/test_tool_parsers.py +++ b/tests/unittest/llmapi/apps/test_tool_parsers.py @@ -23,6 +23,8 @@ from tensorrt_llm.serve.openai_protocol import (ChatCompletionToolsParam, FunctionDefinition) from tensorrt_llm.serve.tool_parser.base_tool_parser import BaseToolParser from tensorrt_llm.serve.tool_parser.core_types import StructureInfo +from tensorrt_llm.serve.tool_parser.deepseekv3_parser import DeepSeekV3Parser +from tensorrt_llm.serve.tool_parser.deepseekv31_parser import DeepSeekV31Parser from tensorrt_llm.serve.tool_parser.kimi_k2_tool_parser import KimiK2ToolParser from tensorrt_llm.serve.tool_parser.qwen3_coder_parser import \ Qwen3CoderToolParser @@ -36,42 +38,42 @@ def sample_tools(): return [ ChatCompletionToolsParam( type="function", - function=FunctionDefinition(name="get_weather", - description="Get the current weather", - parameters={ - "type": "object", - "properties": { - "location": { - "type": - "string", - "description": - "The city and state" - }, - "unit": { - "type": - "string", - "enum": - ["celsius", "fahrenheit"] - } - }, - "required": ["location"] - })), + function=FunctionDefinition( + name="get_weather", + description="Get the current weather", + parameters={ + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["location"], + }, + ), + ), ChatCompletionToolsParam( type="function", - function=FunctionDefinition(name="search_web", - description="Search the web", - parameters={ - "type": "object", - "properties": { - "query": { 
- "type": - "string", - "description": - "The search query" - } - }, - "required": ["query"] - })), + function=FunctionDefinition( + name="search_web", + description="Search the web", + parameters={ + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query", + } + }, + "required": ["query"], + }, + ), + ), ] @@ -1035,6 +1037,140 @@ class TestQwen3CoderToolParser(BaseToolParserTestClass): } +# ============================================================================ +# DeepSeek Parser Tests +# ============================================================================ + + +class TestDeepSeekV3Parser(BaseToolParserTestClass): + """Test suite for DeepSeekV3Parser class.""" + + def make_parser(self): + return DeepSeekV3Parser() + + def make_tool_parser_test_cases(self): + calls_begin = "<|tool▁calls▁begin|>" + calls_end = "<|tool▁calls▁end|>" + call_begin = "<|tool▁call▁begin|>" + call_end = "<|tool▁call▁end|>" + sep = "<|tool▁sep|>" + + single_text = ( + f"Lead {calls_begin}{call_begin}function{sep}get_weather\n```json\n" + f"{json.dumps({'location': 'Tokyo'})}\n```{call_end}{calls_end}") + single_expected_normal = "Lead" # the text is stripped + single_expected_name = "get_weather" + single_expected_params = {"location": "Tokyo"} + + # Provide one tool to satisfy type hints (tuple[str, tuple[str]]) + multiple_text = ( + f"{calls_begin}{call_begin}function{sep}get_weather\n```json\n" + f"{json.dumps({'location': 'Paris'})}\n```{call_end}" + f"{calls_begin}{call_begin}function{sep}search_web\n```json\n" + f"{json.dumps({'query': 'AI'})}\n```{call_end}{calls_end}") + multiple_names = ("get_weather", "search_web") + + malformed_text = ( + f"{calls_begin}{call_begin}function{sep}get_weather\n```json\n" + "{'location': 'Paris'}\n```" + f"{call_end}{calls_end}") + + with_parameters_key_text = ( + f"{calls_begin}{call_begin}function{sep}search_web\n```json\n" + f"{json.dumps({'parameters': {'query': 'TensorRT'}})}\n```{call_end}{calls_end}" + ) + with_parameters_key_name = "search_web" + with_parameters_key_params = {"parameters": {"query": "TensorRT"}} + + partial_bot_token = "<|tool▁cal" + + undefined_tool_text = ( + f"{calls_begin}{call_begin}function{sep}unknown\n```json\n" + f"{json.dumps({'x': 1})}\n```{call_end}{calls_end}") + + return ToolParserTestCases( + has_tool_call_true=f"Hello {calls_begin}", + detect_and_parse_single_tool=( + single_text, + single_expected_normal, + single_expected_name, + single_expected_params, + ), + detect_and_parse_multiple_tools=(multiple_text, multiple_names), + detect_and_parse_malformed_tool=malformed_text, + detect_and_parse_with_parameters_key=( + with_parameters_key_text, + with_parameters_key_name, + with_parameters_key_params, + ), + parse_streaming_increment_partial_bot_token=partial_bot_token, + undefined_tool=undefined_tool_text, + ) + + +class TestDeepSeekV31Parser(BaseToolParserTestClass): + """Test suite for DeepSeekV31Parser class.""" + + def make_parser(self): + return DeepSeekV31Parser() + + def make_tool_parser_test_cases(self): + calls_begin = "<|tool▁calls▁begin|>" + calls_end = "<|tool▁calls▁end|>" + call_begin = "<|tool▁call▁begin|>" + call_end = "<|tool▁call▁end|>" + sep = "<|tool▁sep|>" + + single_text = ( + f"Intro {calls_begin}{call_begin}get_weather{sep}" + f"{json.dumps({'location': 'Tokyo'})}{call_end}{calls_end}") + single_expected_normal = "Intro" # the text is stripped + single_expected_name = "get_weather" + single_expected_params = {"location": "Tokyo"} + + multiple_text = 
(f"{calls_begin}{call_begin}get_weather{sep}" + f"{json.dumps({'location': 'Paris'})}{call_end}" + f"{calls_begin}{call_begin}search_web{sep}" + f"{json.dumps({'query': 'AI'})}{call_end}{calls_end}") + multiple_names = ("get_weather", "search_web") + + malformed_text = ( + f"{calls_begin}{call_begin}get_weather{sep}{{'location':'Paris'}}" + f"{call_end}{calls_end}") + + with_parameters_key_text = ( + f"{calls_begin}{call_begin}search_web{sep}" + f"{json.dumps({'parameters': {'query': 'TensorRT'}})}{call_end}{calls_end}" + ) + with_parameters_key_name = "search_web" + with_parameters_key_params = {"parameters": {"query": "TensorRT"}} + + partial_bot_token = "<|tool▁cal" + + undefined_tool_text = ( + f"{calls_begin}{call_begin}unknown{sep}{json.dumps({'x': 1})}{call_end}{calls_end}" + ) + + return ToolParserTestCases( + has_tool_call_true=f"Hi {calls_begin}", + detect_and_parse_single_tool=( + single_text, + single_expected_normal, + single_expected_name, + single_expected_params, + ), + detect_and_parse_multiple_tools=(multiple_text, multiple_names), + detect_and_parse_malformed_tool=malformed_text, + detect_and_parse_with_parameters_key=( + with_parameters_key_text, + with_parameters_key_name, + with_parameters_key_params, + ), + parse_streaming_increment_partial_bot_token=partial_bot_token, + undefined_tool=undefined_tool_text, + ) + + # ============================================================================ # Integration Tests # ============================================================================ From dd8ce68c942496666c3ccd7d77053fbb01f0dce1 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Fri, 19 Dec 2025 17:20:42 +0800 Subject: [PATCH 14/25] [None][infra] Update waive and waive failed tests for main branch on 12/19 (#10151) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 4 ++-- tests/unittest/llmapi/apps/_test_openai_responses.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index a5839449ff..6629b6ef38 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -478,8 +478,8 @@ disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKI disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] SKIP (https://nvbugs/5755963) unittest/llmapi/apps/test_disagg_serving_perf_metrics.py SKIP (https://nvbugs/5752516) unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5752521) -unittest/llmapi/apps/_test_openai_responses.py::test_reasoning_effort[DeepSeek-R1-Distill-Qwen-1.5B] SKIP (https://nvbugs/5753250) -unittest/llmapi/apps/_test_openai_responses.py::test_multi_turn_chat[Qwen3/Qwen3-0.6B] SKIP (https://nvbugs/5753250) +cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941) +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/5748600) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP 
(https://nvbugs/5608979) examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979) diff --git a/tests/unittest/llmapi/apps/_test_openai_responses.py b/tests/unittest/llmapi/apps/_test_openai_responses.py index a27e7d1011..a5a26f2067 100644 --- a/tests/unittest/llmapi/apps/_test_openai_responses.py +++ b/tests/unittest/llmapi/apps/_test_openai_responses.py @@ -83,6 +83,7 @@ def check_tool_calling(response, first_resp=True, prefix=""): assert not tool_call_exist, f"{err_msg} tool call content should not exist! ({function_call})" +@pytest.mark.skip(reason="https://nvbugs/5753250") @pytest.mark.asyncio(loop_scope="module") async def test_reasoning(client: openai.AsyncOpenAI, model: str): response = await client.responses.create( @@ -129,6 +130,7 @@ async def test_chat(client: openai.AsyncOpenAI, model: str): check_reponse(response, "test_chat: ") +@pytest.mark.skip(reason="https://nvbugs/5753250") @pytest.mark.asyncio(loop_scope="module") async def test_multi_turn_chat(client: openai.AsyncOpenAI, model: str): response = await client.responses.create(model=model, From 7b51e3cedb2d7fa4a43e297b2dacb496d78dec29 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Fri, 19 Dec 2025 17:55:17 +0800 Subject: [PATCH 15/25] [TRTLLM-8638][fix] Add failed cases into waives.txt (#10129) Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 6629b6ef38..8f52849d3c 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -492,3 +492,4 @@ examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2.5_7b_chat-enab examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] SKIP (https://nvbugs/5754976) examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int8-float16-nb:1-use_cpp_runtime] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype SKIP (https://nvbugs/5588376) +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] SKIP (https://nvbugs/5756008) From 9f6abaf59f68fe42c2bc8d564efa32fb881d733e Mon Sep 17 00:00:00 2001 From: tcherckez-nvidia <127761168+tcherckez-nvidia@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:30:02 +0200 Subject: [PATCH 16/25] [#9640][feat] Migrate model registry to v2.0 format with composable configs (#9836) Signed-off-by: Tal Cherckez <127761168+tcherckez-nvidia@users.noreply.github.com> --- examples/auto_deploy/.gitignore | 1 + examples/auto_deploy/model_registry/README.md | 160 +++++++++++ .../configs/dashboard_default.yaml | 9 + .../configs/deepseek_v3_lite.yaml | 4 + .../configs/demollm_triton.yaml | 4 + .../model_registry/configs/gemma3_1b.yaml | 3 + .../model_registry/configs/llama3_3_70b.yaml | 10 + .../configs/llama4_maverick_lite.yaml | 5 + .../model_registry/configs/llama4_scout.yaml | 10 + .../model_registry/configs/multimodal.yaml | 2 + .../model_registry/configs/openelm.yaml | 3 + .../configs/simple_shard_only.yaml | 5 + .../model_registry/configs/world_size_1.yaml | 2 + .../model_registry/configs/world_size_2.yaml | 2 + 
.../model_registry/configs/world_size_4.yaml | 2 + .../model_registry/configs/world_size_8.yaml | 2 + .../auto_deploy/model_registry/models.yaml | 248 ++++++++++++++++++ 17 files changed, 472 insertions(+) create mode 100644 examples/auto_deploy/model_registry/README.md create mode 100644 examples/auto_deploy/model_registry/configs/dashboard_default.yaml create mode 100644 examples/auto_deploy/model_registry/configs/deepseek_v3_lite.yaml create mode 100644 examples/auto_deploy/model_registry/configs/demollm_triton.yaml create mode 100644 examples/auto_deploy/model_registry/configs/gemma3_1b.yaml create mode 100644 examples/auto_deploy/model_registry/configs/llama3_3_70b.yaml create mode 100644 examples/auto_deploy/model_registry/configs/llama4_maverick_lite.yaml create mode 100644 examples/auto_deploy/model_registry/configs/llama4_scout.yaml create mode 100644 examples/auto_deploy/model_registry/configs/multimodal.yaml create mode 100644 examples/auto_deploy/model_registry/configs/openelm.yaml create mode 100644 examples/auto_deploy/model_registry/configs/simple_shard_only.yaml create mode 100644 examples/auto_deploy/model_registry/configs/world_size_1.yaml create mode 100644 examples/auto_deploy/model_registry/configs/world_size_2.yaml create mode 100644 examples/auto_deploy/model_registry/configs/world_size_4.yaml create mode 100644 examples/auto_deploy/model_registry/configs/world_size_8.yaml create mode 100644 examples/auto_deploy/model_registry/models.yaml diff --git a/examples/auto_deploy/.gitignore b/examples/auto_deploy/.gitignore index f15c233aee..0999a4ed76 100644 --- a/examples/auto_deploy/.gitignore +++ b/examples/auto_deploy/.gitignore @@ -6,3 +6,4 @@ benchmark_results.json *.yaml !nano_v3.yaml !nemotron_flash.yaml +!model_registry/configs/*.yaml diff --git a/examples/auto_deploy/model_registry/README.md b/examples/auto_deploy/model_registry/README.md new file mode 100644 index 0000000000..0c5756fca9 --- /dev/null +++ b/examples/auto_deploy/model_registry/README.md @@ -0,0 +1,160 @@ +# AutoDeploy Model Registry + +The AutoDeploy model registry provides a comprehensive, maintainable list of supported models for testing and coverage tracking. 
+ +## Format + +**Version: 2.0** (Flat format with composable configurations) + +### Structure + +```yaml +version: '2.0' +description: AutoDeploy Model Registry - Flat format with composable configs +models: +- name: meta-llama/Llama-3.1-8B-Instruct + yaml_extra: [dashboard_default.yaml, world_size_2.yaml] + +- name: meta-llama/Llama-3.3-70B-Instruct + yaml_extra: [dashboard_default.yaml, world_size_4.yaml, llama-3.3-70b.yaml] + +- name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + yaml_extra: [dashboard_default.yaml, world_size_2.yaml, demollm_triton.yaml] +``` + +### Key Concepts + +- **Flat list**: Models are in a single flat list (not grouped) +- **Composable configs**: Each model references YAML config files via `yaml_extra` +- **Deep merging**: Config files are merged in order (later files override earlier ones) +- **No inline args**: All configuration is in YAML files for reusability + +## Configuration Files + +Config files are stored in `configs/` subdirectory and define runtime parameters: + +### Core Configs + +| File | Purpose | Example Use | +|------|---------|-------------| +| `dashboard_default.yaml` | Baseline settings for all models | Always first in yaml_extra | +| `world_size_N.yaml` | GPU count (1, 2, 4, 8) | Defines tensor_parallel_size | + +### Runtime Configs + +| File | Purpose | +|------|---------| +| `multimodal.yaml` | Vision + text models | +| `demollm_triton.yaml` | DemoLLM runtime with Triton backend | +| `simple_shard_only.yaml` | Large models requiring simple sharding + +### Model-Specific Configs + +| File | Purpose | +|------|---------| +| `llama-3.3-70b.yaml` | Optimized settings for Llama 3.3 70B | +| `nano_v3.yaml` | Settings for Nemotron Nano V3 | +| `llama-4-scout.yaml` | Settings for Llama 4 Scout | +| `openelm.yaml` | Apple OpenELM (custom tokenizer) | +| `gemma3_1b.yaml` | Gemma 3 1B (sequence length) | +| `deepseek_v3_lite.yaml` | DeepSeek V3/R1 (reduced layers) | +| `llama4_maverick_lite.yaml` | Llama 4 Maverick (reduced layers) | + +## Adding a New Model + +### Simple Model (Standard Config) + +```yaml +- name: organization/my-new-model-7b + yaml_extra: [dashboard_default.yaml, world_size_2.yaml] +``` + +### Model with Special Requirements + +```yaml +- name: organization/my-multimodal-model + yaml_extra: [dashboard_default.yaml, world_size_4.yaml, multimodal.yaml] +``` + +### Model with Custom Config + +1. Create `configs/my_model.yaml`: + +```yaml +# Custom settings for my model +max_batch_size: 2048 +kv_cache_free_gpu_memory_fraction: 0.95 +cuda_graph_config: + enable_padding: true +``` + +2. Reference it in `models.yaml`: + +```yaml +- name: organization/my-custom-model + yaml_extra: [dashboard_default.yaml, world_size_8.yaml, my_model.yaml] +``` + +## Config Merging + +Configs are merged in order. 
Example: + +```yaml +yaml_extra: + - dashboard_default.yaml # baseline: runtime=trtllm, benchmark_enabled=true + - world_size_2.yaml # adds: tensor_parallel_size=2 + - openelm.yaml # overrides: tokenizer=llama-2, benchmark_enabled=false +``` + +**Result**: `runtime=trtllm, tensor_parallel_size=2, tokenizer=llama-2, benchmark_enabled=false` + +## World Size Guidelines + +| World Size | Model Size Range | Example Models | +|------------|------------------|----------------| +| 1 | \< 2B params | TinyLlama, Qwen 0.5B, Phi-4-mini | +| 2 | 2-15B params | Llama 3.1 8B, Qwen 7B, Mistral 7B | +| 4 | 20-80B params | Llama 3.3 70B, QwQ 32B, Gemma 27B | +| 8 | 80B+ params | DeepSeek V3, Llama 405B, Nemotron Ultra | + +## Model Coverage + +The registry contains models distributed across different GPU configurations (world sizes 1, 2, 4, and 8), including both text-only and multimodal models. + +**To verify current model counts and coverage:** + +```bash +cd /path/to/autodeploy-dashboard +python3 scripts/prepare_model_coverage_v2.py \ + --source local \ + --local-path /path/to/TensorRT-LLM \ + --output /tmp/model_coverage.yaml + +# View summary +grep -E "^- name:|yaml_extra:" /path/to/TensorRT-LLM/examples/auto_deploy/model_registry/models.yaml | wc -l +``` + +When adding or removing models, use `prepare_model_coverage_v2.py` to validate the registry structure and coverage. + +## Best Practices + +1. **Always include `dashboard_default.yaml` first** - it provides baseline settings +1. **Always include a `world_size_N.yaml`** - defines GPU count +1. **Add special configs after world_size** - they override defaults +1. **Create reusable configs** - if 3+ models need same settings, make a config file +1. **Use model-specific configs sparingly** - only for unique requirements +1. 
**Test before committing** - verify with `prepare_model_coverage_v2.py` + +## Testing Changes + +```bash +# Generate workload from local changes +cd /path/to/autodeploy-dashboard +python3 scripts/prepare_model_coverage_v2.py \ + --source local \ + --local-path /path/to/TensorRT-LLM \ + --output /tmp/test_workload.yaml + +# Verify output +cat /tmp/test_workload.yaml +``` diff --git a/examples/auto_deploy/model_registry/configs/dashboard_default.yaml b/examples/auto_deploy/model_registry/configs/dashboard_default.yaml new file mode 100644 index 0000000000..6d22bc2a43 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/dashboard_default.yaml @@ -0,0 +1,9 @@ +# Default configuration for all AutoDeploy dashboard tests +# These are baseline settings that apply to all models unless overridden + +runtime: trtllm +attn_backend: flashinfer +compile_backend: torch-compile +model_factory: AutoModelForCausalLM +skip_loading_weights: false +max_seq_len: 512 diff --git a/examples/auto_deploy/model_registry/configs/deepseek_v3_lite.yaml b/examples/auto_deploy/model_registry/configs/deepseek_v3_lite.yaml new file mode 100644 index 0000000000..8475097ba2 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/deepseek_v3_lite.yaml @@ -0,0 +1,4 @@ +# Configuration for DeepSeek V3 and R1 with reduced layers +# Full models are too large, so we test with limited layers +model_kwargs: + num_hidden_layers: 10 diff --git a/examples/auto_deploy/model_registry/configs/demollm_triton.yaml b/examples/auto_deploy/model_registry/configs/demollm_triton.yaml new file mode 100644 index 0000000000..6f0d9a7326 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/demollm_triton.yaml @@ -0,0 +1,4 @@ +# Configuration for DemoLLM runtime with Triton backend +# Used for experimental or specific model requirements +runtime: demollm +attn_backend: triton diff --git a/examples/auto_deploy/model_registry/configs/gemma3_1b.yaml b/examples/auto_deploy/model_registry/configs/gemma3_1b.yaml new file mode 100644 index 0000000000..d076697e8a --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/gemma3_1b.yaml @@ -0,0 +1,3 @@ +# Configuration for Gemma 3 1B model +# Specific sequence length requirement due to small attention window +max_seq_len: 511 diff --git a/examples/auto_deploy/model_registry/configs/llama3_3_70b.yaml b/examples/auto_deploy/model_registry/configs/llama3_3_70b.yaml new file mode 100644 index 0000000000..828800c93b --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/llama3_3_70b.yaml @@ -0,0 +1,10 @@ +# Configuration for Llama 3.3 70B +# AutoDeploy-specific settings for large Llama models + +max_batch_size: 1024 +max_num_tokens: 2048 +free_mem_ratio: 0.9 +trust_remote_code: true +cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024] +kv_cache_config: + dtype: fp8 diff --git a/examples/auto_deploy/model_registry/configs/llama4_maverick_lite.yaml b/examples/auto_deploy/model_registry/configs/llama4_maverick_lite.yaml new file mode 100644 index 0000000000..24372fa5cd --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/llama4_maverick_lite.yaml @@ -0,0 +1,5 @@ +# Configuration for Llama 4 Maverick with reduced layers +# Full model is too large for testing +model_kwargs: + text_config: + num_hidden_layers: 5 diff --git a/examples/auto_deploy/model_registry/configs/llama4_scout.yaml b/examples/auto_deploy/model_registry/configs/llama4_scout.yaml new file mode 100644 index 0000000000..25b5c98971 --- /dev/null +++ 
b/examples/auto_deploy/model_registry/configs/llama4_scout.yaml @@ -0,0 +1,10 @@ +# Configuration for Llama 4 Scout (VLM) +# AutoDeploy-specific settings for Llama 4 Scout MoE vision model + +max_batch_size: 1024 +max_num_tokens: 2048 +free_mem_ratio: 0.9 +trust_remote_code: true +cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024] +kv_cache_config: + dtype: fp8 diff --git a/examples/auto_deploy/model_registry/configs/multimodal.yaml b/examples/auto_deploy/model_registry/configs/multimodal.yaml new file mode 100644 index 0000000000..0220389c92 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/multimodal.yaml @@ -0,0 +1,2 @@ +# Configuration for multimodal (vision + text) models +model_factory: AutoModelForImageTextToText diff --git a/examples/auto_deploy/model_registry/configs/openelm.yaml b/examples/auto_deploy/model_registry/configs/openelm.yaml new file mode 100644 index 0000000000..848b125465 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/openelm.yaml @@ -0,0 +1,3 @@ +# Configuration for Apple OpenELM models +# These models require Llama-2 tokenizer +tokenizer: meta-llama/Llama-2-7b-hf diff --git a/examples/auto_deploy/model_registry/configs/simple_shard_only.yaml b/examples/auto_deploy/model_registry/configs/simple_shard_only.yaml new file mode 100644 index 0000000000..518cfcb219 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/simple_shard_only.yaml @@ -0,0 +1,5 @@ +# Configuration for models that require simple sharding only +# Used for very large models with specific sharding requirements +transforms: + detect_sharding: + simple_shard_only: true diff --git a/examples/auto_deploy/model_registry/configs/world_size_1.yaml b/examples/auto_deploy/model_registry/configs/world_size_1.yaml new file mode 100644 index 0000000000..266ced60fc --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/world_size_1.yaml @@ -0,0 +1,2 @@ +# Configuration for single GPU models +world_size: 1 diff --git a/examples/auto_deploy/model_registry/configs/world_size_2.yaml b/examples/auto_deploy/model_registry/configs/world_size_2.yaml new file mode 100644 index 0000000000..ba7a36dda3 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/world_size_2.yaml @@ -0,0 +1,2 @@ +# Configuration for 2 GPU models +world_size: 2 diff --git a/examples/auto_deploy/model_registry/configs/world_size_4.yaml b/examples/auto_deploy/model_registry/configs/world_size_4.yaml new file mode 100644 index 0000000000..1a6da8c44e --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/world_size_4.yaml @@ -0,0 +1,2 @@ +# Configuration for 4 GPU models +world_size: 4 diff --git a/examples/auto_deploy/model_registry/configs/world_size_8.yaml b/examples/auto_deploy/model_registry/configs/world_size_8.yaml new file mode 100644 index 0000000000..d978b0bcd4 --- /dev/null +++ b/examples/auto_deploy/model_registry/configs/world_size_8.yaml @@ -0,0 +1,2 @@ +# Configuration for 8 GPU models +world_size: 8 diff --git a/examples/auto_deploy/model_registry/models.yaml b/examples/auto_deploy/model_registry/models.yaml new file mode 100644 index 0000000000..1ec27706db --- /dev/null +++ b/examples/auto_deploy/model_registry/models.yaml @@ -0,0 +1,248 @@ +version: '2.0' +description: AutoDeploy Model Registry - Flat format with composable configs +models: +- name: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +- name: Qwen/Qwen2.5-0.5B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +- 
name: Qwen/Qwen3-0.6B + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +# DISABLED: TorchDynamo compilation error - fake tensor dispatch failure +# - name: apple/OpenELM-270M-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'openelm.yaml'] +# DISABLED: TorchDynamo compilation error - fake tensor dispatch failure +# - name: apple/OpenELM-1_1B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'openelm.yaml'] +# DISABLED: TorchDynamo compilation error - fake tensor dispatch failure +# - name: apple/OpenELM-3B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'openelm.yaml'] +- name: microsoft/Phi-4-mini-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +- name: microsoft/Phi-4-mini-reasoning + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml'] +- name: google/gemma-3-1b-it + yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'gemma3_1b.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-3.1-8B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: casperhansen/llama-3-8b-instruct-awq +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-3.2-1B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-3.2-3B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen2.5-1.5B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen2.5-3B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: Qwen/Qwen2.5-7B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen2.5-7B-Instruct-AWQ + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen3-4B + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: Qwen/Qwen3-8B + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: microsoft/phi-4 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: microsoft/Phi-4-reasoning + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: microsoft/Phi-4-reasoning-plus + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: google/gemma-1.1-7b-it + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: google/gemma-2-2b-it +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: google/gemma-2-9b-it +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: google/codegemma-7b-it + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: mistralai/Mistral-7B-Instruct-v0.2 +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: mistralai/Mistral-7B-Instruct-v0.3 +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: TheBloke/Mistral-7B-Instruct-v0.2-GPTQ + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: 
bigcode/starcoder2-7b + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: bigcode/starcoder2-15b-instruct-v0.1 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: deepseek-ai/DeepSeek-Prover-V1.5-SFT +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: deepseek-ai/DeepSeek-Prover-V2-7B +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-3.1-2b-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: ibm-granite/granite-3.1-8b-instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-3.3-2b-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-3.3-8b-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-guardian-3.1-2b + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: ibm-granite/granite-guardian-3.2-5b + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: meta-llama/CodeLlama-7b-Instruct-hf + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: meta-llama/CodeLlama-7b-Python-hf + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-2-7b-chat-hf +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: FakeTensorMode error in unified_attn export +# - name: nvidia/Llama-3.1-8B-Instruct-FP8 +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: nvidia/Llama-3.1-Minitron-4B-Depth-Base +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: nvidia/Llama-3.1-Minitron-4B-Width-Base +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/Llama-3.1-Nemotron-Nano-8B-v1 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/Mistral-NeMo-Minitron-8B-Base + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: openai/gpt-oss-20b + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: Custom op error - append_paged_kv_cache missing Float kernel +# - name: bigcode/starcoder2-15b +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: ibm-granite/granite-3.0-8b-instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: mistralai/Ministral-8B-Instruct-2410 +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/NVIDIA-Nemotron-Nano-9B-v2-FP8 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml'] +- name: google/gemma-3-27b-it + yaml_extra: 
['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml'] +- name: google/gemma-3-2b-it + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +- name: deepseek-ai/DeepSeek-V2.5 + yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: Network timeout downloading from Hugging Face +# - name: ai21labs/AI21-Jamba-1.5-Mini +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: THUDM/glm-4v-9b +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: meta-llama/Llama-3.2-11B-Vision-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml'] +# DISABLED: Auto-deploy compilation error +# - name: meta-llama/Llama-3.3-70B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'llama3_3_70b.yaml'] +- name: meta-llama/CodeLlama-34b-Instruct-hf + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: meta-llama/Llama-2-13b-chat-hf + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: microsoft/Phi-3-medium-128k-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: microsoft/Phi-3-medium-4k-instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: mistralai/Codestral-22B-v0.1 +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: Graph transformation error in auto-deploy +# - name: neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: TheBloke/falcon-40b-instruct-GPTQ + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: Qwen/QwQ-32B + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: google/gemma-2-27b-it +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: perplexity-ai/r1-1776-distill-llama-70b + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3 + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'nano_v3.yaml'] +- name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: Qwen/QwQ-32B-Preview + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: Qwen/Qwen3-Coder-32B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: Qwen/Qwen3-235B-A22B-Instruct-2507 + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: Network timeout downloading from Hugging Face +# - name: ai21labs/AI21-Jamba-1.5-Large +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +- name: nvidia/OpenReasoning-Nemotron-32B + yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: Auto-deploy compilation error +# - name: mistralai/Mistral-Large-Instruct-v2.1 +# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: deepseek-ai/DeepSeek-R1-Distill-Llama-70B +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: Auto-deploy compilation error +# - name: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: Graph transformation error in auto-deploy +# - name: 
mistralai/Mixtral-8x22B-Instruct-v0.1 +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: FakeTensorMode error in unified_attn export +# - name: nvidia/Llama-3.1-70B-Instruct-FP8 +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: FakeTensorMode error in unified_attn export +# - name: nvidia/Llama-3.1-405B-Instruct-FP8 +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +# DISABLED: Model loading failure - dynamic module registry issue +# - name: nvidia/Llama-3_1-Nemotron-51B-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: nvidia/Llama-3_1-Nemotron-Ultra-253B-v1 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: nvidia/Llama-3_3-Nemotron-Super-49B-v1 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: Qwen/Qwen3-30B-A3B + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: Qwen/Qwen3-235B-A22B + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'simple_shard_only.yaml'] +- name: deepseek-ai/DeepSeek-R1 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'deepseek_v3_lite.yaml'] +# DISABLED: Auto-deploy compilation error +# - name: deepseek-ai/DeepSeek-V3 +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'deepseek_v3_lite.yaml'] +# DISABLED: Assertion failure in auto-deploy transform pipeline +# - name: deepseek-ai/DeepSeek-Coder-V2-Instruct +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: Qwen/Qwen3-VL-8B-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4 + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml'] +# DISABLED: SLURM cluster cancellation - infrastructure issue +# - name: codellama/CodeLlama-70b-Instruct-hf +# yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: meta-llama/Llama-3.2-90B-Vision-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml'] +- name: openai/gpt-oss-120b + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml'] +- name: meta-llama/Llama-4-Scout-17B-16E-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_scout.yaml'] +- name: meta-llama/Llama-4-Maverick-17B-128E-Instruct + yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_maverick_lite.yaml'] From 27e49e290427e845abf44e9705ce8abf0c0a81f2 Mon Sep 17 00:00:00 2001 From: xxi <95731198+xxi-nv@users.noreply.github.com> Date: Fri, 19 Dec 2025 22:14:26 +0800 Subject: [PATCH 17/25] =?UTF-8?q?[None][fix]=20waive=20the=20failed=20test?= =?UTF-8?q?=20test=5Fservice=5Fdiscovery[etcd-load=5Fba=E2=80=A6=20(#10161?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: xxi --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8f52849d3c..9fd9e00822 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -476,6 
+476,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_pref accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5702793) disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKIP (https://nvbugs/5748564) disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] SKIP (https://nvbugs/5755963) +disaggregated/test_auto_scaling.py::test_service_discovery[etcd-load_balancing] SKIP (https://nvbugs/5757415) unittest/llmapi/apps/test_disagg_serving_perf_metrics.py SKIP (https://nvbugs/5752516) unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5752521) cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941) From 7b71ff6b8a378c8888da0ba1757cbcc65a7b9987 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Sat, 20 Dec 2025 00:19:20 +0800 Subject: [PATCH 18/25] [https://nvbugs/5722653][fix] Unwaive fixed test (#10157) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 9fd9e00822..4d2aa7ab7b 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -435,7 +435,6 @@ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377, https://nvbugs/5740075) -disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740087) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) From dfa11d810e72adac2d21073ae7644c007ce673cd Mon Sep 17 00:00:00 2001 From: Venky <23023424+venkywonka@users.noreply.github.com> Date: Sat, 20 Dec 2025 00:18:43 +0530 Subject: [PATCH 19/25] [TRTC-102][docs] `--extra_llm_api_options`->`--config` in docs/examples/tests (#10005) --- .gitignore | 2 +- .../note_sections.rst | 13 +- ...practice_on_DeepSeek-R1_in_TensorRT-LLM.md | 24 +- .../blogs/tech_blog/blog11_GPT_OSS_Eagle3.md | 2 +- ..._R1_MTP_Implementation_and_Optimization.md | 8 +- ...ling_Expert_Parallelism_in_TensorRT-LLM.md | 8 +- .../blog6_Llama4_maverick_eagle_guide.md | 2 +- .../blog9_Deploying_GPT_OSS_on_TRTLLM.md | 14 +- 
docs/source/commands/trtllm-bench.rst | 20 +- docs/source/commands/trtllm-eval.rst | 4 + .../run-benchmark-with-trtllm-serve.md | 66 ++-- .../commands/trtllm-serve/trtllm-serve.rst | 20 +- docs/source/deployment-guide/config_table.rst | 338 +++++++++--------- ...loyment-guide-for-deepseek-r1-on-trtllm.md | 8 +- .../deployment-guide-for-gpt-oss-on-trtllm.md | 8 +- ...nt-guide-for-kimi-k2-thinking-on-trtllm.md | 2 +- ...oyment-guide-for-llama3.3-70b-on-trtllm.md | 6 +- ...oyment-guide-for-llama4-scout-on-trtllm.md | 6 +- ...ployment-guide-for-qwen3-next-on-trtllm.md | 8 +- .../deployment-guide-for-qwen3-on-trtllm.md | 10 +- docs/source/deployment-guide/index.rst | 22 +- .../developer-guide/perf-benchmarking.md | 13 +- docs/source/developer-guide/perf-overview.md | 6 +- .../benchmarking_with_trtllm_bench.md | 10 +- docs/source/features/disagg-serving.md | 22 +- docs/source/features/guided-decoding.md | 12 +- docs/source/features/lora.md | 20 +- docs/source/features/parallel-strategy.md | 2 +- docs/source/features/speculative-decoding.md | 8 +- .../torch_compile_and_piecewise_cuda_graph.md | 78 ++-- docs/source/helper.py | 11 +- .../legacy/performance/perf-benchmarking.md | 12 +- .../benchmarking_with_trtllm_bench.md | 4 +- .../advanced/serving_with_trtllm_serve.md | 4 +- docs/source/torch/features/lora.md | 8 +- examples/__init__.py | 14 + examples/configs/README.md | 2 +- examples/configs/__init__.py | 14 + examples/configs/database/__init__.py | 14 + examples/disaggregated/README.md | 36 +- .../slurm/benchmark/start_worker.sh | 2 +- .../service_discovery_example/launch.slurm | 8 +- .../slurm/simple_example/launch.slurm | 4 +- examples/llm-api/llm_mgmn_trtllm_bench.sh | 2 +- examples/models/core/deepseek_v3/README.md | 52 +-- examples/models/core/gemma/README.md | 8 +- examples/models/core/gpt_oss/README.md | 2 +- examples/models/core/kimi_k2/README.md | 4 +- examples/models/core/llama/README.md | 4 +- examples/models/core/llama4/README.md | 12 +- .../models/core/mistral_large_3/README.md | 2 +- examples/models/core/multimodal/README.md | 2 +- .../models/core/nemotron/README_nano-v2-vl.md | 6 +- examples/models/core/phi/phi4-mm.md | 4 +- examples/models/core/qwen/README.md | 12 +- .../disaggregated/disagg_serving_local.sh | 4 +- .../serve/deepseek_r1_reasoning_parser.sh | 4 +- .../openai_completion_client_json_schema.py | 2 +- examples/sparse_attention/RocketKV.md | 6 +- examples/wide_ep/ep_load_balancer/README.md | 12 +- scripts/generate_config_table.py | 17 +- .../accuracy/test_disaggregated_serving.py | 4 +- .../defs/disaggregated/test_auto_scaling.py | 2 +- .../defs/disaggregated/test_disaggregated.py | 4 +- .../disaggregated/test_disaggregated_etcd.py | 4 +- .../defs/perf/README_release_test.md | 4 +- tests/integration/defs/perf/test_perf.py | 16 +- .../defs/stress_test/stress_test.py | 2 +- tests/integration/defs/test_e2e.py | 8 +- .../tools/test_config_database_sync.py | 29 +- 70 files changed, 625 insertions(+), 498 deletions(-) rename docs/source/{deployment-guide => _includes}/note_sections.rst (75%) create mode 100644 examples/__init__.py create mode 100644 examples/configs/__init__.py create mode 100644 examples/configs/database/__init__.py diff --git a/.gitignore b/.gitignore index 130ea9837b..7f7ffd18c6 100644 --- a/.gitignore +++ b/.gitignore @@ -56,7 +56,7 @@ tensorrt_llm/scripts docs/source/**/*.rst !docs/source/examples/index.rst !docs/source/deployment-guide/config_table.rst -!docs/source/deployment-guide/note_sections.rst +!docs/source/_includes/note_sections.rst *.swp # 
Testing diff --git a/docs/source/deployment-guide/note_sections.rst b/docs/source/_includes/note_sections.rst similarity index 75% rename from docs/source/deployment-guide/note_sections.rst rename to docs/source/_includes/note_sections.rst index 4cd0d1c41d..d0b1657638 100644 --- a/docs/source/deployment-guide/note_sections.rst +++ b/docs/source/_includes/note_sections.rst @@ -1,11 +1,20 @@ .. - Reusable note sections for deployment guides. + Reusable note sections for docs. Include specific notes using: - .. include:: note_sections.rst + .. include:: /note_sections.rst :start-after: .. start-note- :end-before: .. end-note- +.. start-note-config-flag-alias + +.. note:: + + **Non-breaking**: ``--config `` is the preferred flag for passing a :ref:`YAML configuration file `. + Existing workflows using ``--extra_llm_api_options `` continue to work; it is an equivalent alias. + +.. end-note-config-flag-alias + .. start-note-traffic-patterns .. note:: diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index ad0e9975a1..7072f770bf 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -139,7 +139,7 @@ To do the benchmark, run the following command: ```bash YOUR_DATA_PATH= -cat >./extra-llm-api-config.yml<./config.yml<./extra-llm-api-config.yml <./config.yml <./extra-llm-api-config.yml <./config.yml < -cat >./extra-llm-api-config.yml<./config.yml<./extra-llm-api-config.yml<./config.yml< -cat >./extra-llm-api-config.yml<./config.yml< -cat >./extra-llm-api-config.yml<./config.yml< ./extra_llm_api_options.yaml < ./config.yaml < ./extra_llm_api_options_eplb.yaml < ./config_eplb.yaml < @@ -201,7 +201,7 @@ trtllm-serve \ --ep_size 4 \ --max_batch_size 640 \ --trust_remote_code \ - --extra_llm_api_options max_throughput.yaml \ + --config max_throughput.yaml \ --kv_cache_free_gpu_memory_fraction 0.9 ``` @@ -223,7 +223,7 @@ OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT LLM ### Selecting Triton as the MoE backend -To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--extra_llm_api_options`: +To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--config`: ```yaml moe_config: @@ -347,7 +347,7 @@ OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM ### Selecting Triton as the MoE backend -To use the Triton MoE backend with **trtllm-serve** (or other commands), add this snippet to the YAML file passed via `--extra_llm_api_options`: +To use the Triton MoE backend with **trtllm-serve** (or other commands), add this snippet to the YAML file passed via `--config`: ```yaml moe_config: diff --git a/docs/source/commands/trtllm-bench.rst b/docs/source/commands/trtllm-bench.rst index cd69874e0c..fee60a9ab7 100644 --- a/docs/source/commands/trtllm-bench.rst +++ b/docs/source/commands/trtllm-bench.rst @@ -3,9 +3,12 @@ trtllm-bench trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It provides three main subcommands for different benchmarking scenarios: -**Common Options for All Commands:** +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias -**Usage:** +Syntax +------ .. 
click:: tensorrt_llm.commands.bench:main :prog: trtllm-bench @@ -14,8 +17,11 @@ trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It p +Dataset preparation +------------------ + prepare_dataset.py -=========================== +^^^^^^^^^^^^^^^^^^ trtllm-bench is designed to work with the `prepare_dataset.py `_ script, which generates benchmark datasets in the required format. The prepare_dataset script supports: @@ -38,7 +44,7 @@ trtllm-bench is designed to work with the `prepare_dataset.py --help``. +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias + Syntax diff --git a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md index 34a509f5a4..089426d9b7 100644 --- a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md +++ b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md @@ -3,30 +3,11 @@ TensorRT LLM provides the OpenAI-compatible API via `trtllm-serve` command. A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference). -This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B and Qwen2.5-VL-7B for multimodal models: - * Methodology Introduction - * Launch the OpenAI-Compatible Server with NGC container - * Run the performance benchmark - * Using `extra_llm_api_options` - * Multimodal Serving and Benchmarking - -## Table of Contents -- [Run benchmarking with `trtllm-serve`](#run-benchmarking-with-trtllm-serve) - - [Table of Contents](#table-of-contents) - - [Methodology Introduction](#methodology-introduction) - - [Preparation](#preparation) - - [Launch the NGC container](#launch-the-ngc-container) - - [Start the trtllm-serve service](#start-the-trtllm-serve-service) - - [Benchmark using `tensorrt_llm.serve.scripts.benchmark_serving`](#benchmark-using-tensorrt_llmservescriptsbenchmark_serving) - - [Key Metrics](#key-metrics) - - [About `extra_llm_api_options`](#about-extra_llm_api_options) - - [`kv_cache_config`](#kv_cache_config) - - [`cuda_graph_config`](#cuda_graph_config) - - [`moe_config`](#moe_config) - - [`attention_backend`](#attention_backend) - - [Multimodal Serving and Benchmarking](#multimodal-serving-and-benchmarking) - - [Setting up Multimodal Serving](#setting-up-multimodal-serving) - - [Multimodal Benchmarking](#multimodal-benchmarking) +```{contents} +:Contents +:local: +:depth: 3 +``` ## Methodology Introduction @@ -57,9 +38,9 @@ For benchmarking purposes, first create a bash script using the following code a ```bash #! /bin/bash model_path=/path/to/llama3.1_70B -extra_llm_api_file=/tmp/extra-llm-api-config.yml +config_file=/tmp/config.yml -cat << EOF > ${extra_llm_api_file} +cat << EOF > ${config_file} enable_attention_dp: false print_iter_log: true cuda_graph_config: @@ -77,7 +58,7 @@ trtllm-serve ${model_path} \ --tp_size 1 \ --ep_size 1 \ --trust_remote_code \ - --extra_llm_api_options ${extra_llm_api_file} + --config ${config_file} ``` > [!NOTE] > The trtllm-llmapi-launch is a script that launches the LLM-API code on @@ -215,17 +196,24 @@ $$ To get more detailed metrics besides the key metrics above, there is an [experimental tool](https://github.com/NVIDIA/TensorRT-LLM/tree/main/tensorrt_llm/serve/scripts/time_breakdown) for request time breakdown. 
-## About `extra_llm_api_options` - trtllm-serve provides `extra_llm_api_options` knob to **overwrite** the parameters specified by trtllm-serve. - Generally, We create a YAML file that contains various performance switches. - e.g - ```yaml - cuda_graph_config: - padding_enabled: true - print_iter_log: true - kv_cache_dtype: fp8 - enable_attention_dp: true - ``` +## About `--config` + +```{eval-rst} +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + +`trtllm-serve` provides `--config` to **overwrite** the parameters specified by `trtllm-serve`. +Generally, we create a YAML file that contains various performance switches. For example: + +```yaml +cuda_graph_config: + padding_enabled: true +print_iter_log: true +kv_cache_dtype: fp8 +enable_attention_dp: true +``` The following is a list of common performance switches. #### `kv_cache_config` @@ -274,7 +262,7 @@ The following is a list of common performance switches.  **Default**: TRTLLM -See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the extra\_llm\_api\_options`.` +See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `--config`. ## Multimodal Serving and Benchmarking diff --git a/docs/source/commands/trtllm-serve/trtllm-serve.rst b/docs/source/commands/trtllm-serve/trtllm-serve.rst index 33bad7f1e5..7e09872a9b 100644 --- a/docs/source/commands/trtllm-serve/trtllm-serve.rst +++ b/docs/source/commands/trtllm-serve/trtllm-serve.rst @@ -98,7 +98,7 @@ First, create a configuration file: .. code-block:: bash - cat >./extra-llm-api-config.yml<./config.yml<`_ m .. code-block:: bash - echo -e "enable_attention_dp: true\npytorch_backend_config:\n enable_overlap_scheduler: true" > extra-llm-api-config.yml + echo -e "enable_attention_dp: true\npytorch_backend_config:\n enable_overlap_scheduler: true" > config.yml srun -N 2 -w [NODES] \ --output=benchmark_2node.log \ @@ -210,7 +210,7 @@ You can deploy `DeepSeek-V3 `_ m --container-image= \ --container-mounts=/workspace:/workspace \ --container-workdir /workspace \ - bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml" + bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --config ./config.yml" See `the source code `_ of ``trtllm-llmapi-launch`` for more details. @@ -234,11 +234,11 @@ For the default PyTorch backend, iteration statistics logging is enabled by sett # extra_llm_config.yaml enable_iter_perf_stats: true -Start the server and specify the ``--extra_llm_api_options`` argument with the path to the YAML file: +Start the server and specify the ``--config`` argument with the path to the YAML file: .. code-block:: bash - trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --extra_llm_api_options extra_llm_config.yaml + trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --config config.yaml After sending at least one inference request to the server, you can fetch runtime iteration statistics by polling the ``/metrics`` endpoint. 
Since the statistics are stored in an internal queue and removed once retrieved, it's recommended to poll the endpoint shortly after each request and store the results if needed. @@ -272,10 +272,16 @@ Example output: } ] +.. _configuring-with-yaml-files: + Configuring with YAML Files ---------------------------- -You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--extra_llm_api_options`` option to the path of a YAML file, the arguments in the file will override the corresponding command line arguments. +You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--config`` option to the path of a YAML file. The arguments in the file override the corresponding command line arguments. + +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias The yaml file is configuration of `tensorrt_llm.llmapi.LlmArgs `_, the class has multiple levels of hierarchy, to configure the top level arguments like ``max_batch_size``, the yaml file should be like: diff --git a/docs/source/deployment-guide/config_table.rst b/docs/source/deployment-guide/config_table.rst index c2e1e5b55d..bb59b7505f 100644 --- a/docs/source/deployment-guide/config_table.rst +++ b/docs/source/deployment-guide/config_table.rst @@ -1,4 +1,4 @@ -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns @@ -25,121 +25,121 @@ - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 
4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max 
Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` .. 
end-deepseek-ai/DeepSeek-R1-0528 @@ -166,169 +166,169 @@ - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` * - 4xB200_NVL - Balanced - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp4_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve 
nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` * - 4xB200_NVL - Balanced - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp4_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` * - 4xB200_NVL - Max Throughput - 8192 / 1024 - 256 - `8k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 
8192 / 1024 - 128 - `8k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 256 - `8k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` .. end-nvidia/DeepSeek-R1-0528-FP4-v2 @@ -355,720 +355,720 @@ - 1024 / 1024 - 4 - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` * - B200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` * - B200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` * - B200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` * - B200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` * - 2xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` * - 2xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` * - 2xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` * - 2xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` * - 2xB200_NVL - Balanced - 8192 
/ 1024 - 16 - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` * - 2xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` * - 4xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` * - 4xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` * - 4xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 8192 - 16 - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` * - H200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` * - H200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 
1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` * - H200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` * - H200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` * - H200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` * - 2xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` * - 2xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` * - 2xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` * - 2xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` * - 2xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` * - 2xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` * - 4xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` * - 4xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` * - 4xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp4_conc4.yaml `_ - - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` * - 4xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` * - 4xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` * - 4xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` * - 4xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` * - 4xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` * - 4xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` * - 8xH200_SXM - 
Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 8192 - 16 - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` .. end-openai/gpt-oss-120b diff --git a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md index e4165eac09..881f86eb12 100644 --- a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md @@ -115,7 +115,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve deepseek-ai/DeepSeek-R1-0528 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve deepseek-ai/DeepSeek-R1-0528 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -124,7 +124,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. 
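A minimal sketch of what such a `--config` YAML might contain is shown below for orientation; the key names (`tensor_parallel_size`, `kv_cache_config`) are assumed from the `TorchLlmArgs` reference linked in this guide, and the values are illustrative placeholders rather than tuned settings taken from this guide.

```yaml
# Illustrative ${EXTRA_LLM_API_FILE} contents, assuming it is passed as:
#   trtllm-serve <model> --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE}
# Key names follow the TorchLlmArgs reference linked in this guide; values are placeholders.
tensor_parallel_size: 8
kv_cache_config:
  free_gpu_memory_fraction: 0.9
```

The per-model YAML files referenced in the tables above are tuned versions of this same format; only the flag used to pass them changes from `--extra_llm_api_options` to `--config`.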
#### `tensor_parallel_size` @@ -200,7 +200,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Default**: `TRTLLM` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ### Wide Expert Parallelism @@ -435,7 +435,7 @@ $$ The following tables list recommended configurations from the comprehensive database for different performance profiles. ```{eval-rst} -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns diff --git a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md index 5a9f9f4c72..d28f3fa9f3 100644 --- a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md @@ -113,7 +113,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -122,7 +122,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -178,7 +178,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * `backend`: The backend to use for MoE operations. **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -383,7 +383,7 @@ $$ The following table lists recommended configurations from the comprehensive database for different performance profiles. ```{eval-rst} -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. 
end-note-traffic-patterns diff --git a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md index 391a72091d..8ae2dac147 100644 --- a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md @@ -60,7 +60,7 @@ With the `EXTRA_OPTIONS_YAML_FILE`, use the following example command to launch ```bash trtllm-serve nvidia/Kimi-K2-Thinking-NVFP4 \ --host 0.0.0.0 --port 8000 \ - --extra_llm_api_options ${EXTRA_OPTIONS_YAML_FILE} + --config ${EXTRA_OPTIONS_YAML_FILE} ``` TensorRT LLM will load weights and select the best kernels during startup. The server is successfully launched when the following log is shown: diff --git a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md index d3e328d810..f58405e8be 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md @@ -83,7 +83,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Llama-3.3-70B-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -92,7 +92,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -170,7 +170,7 @@ These options provide control over TensorRT LLM's behavior and are set within th  **Default**: TRTLLM -See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`. +See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint diff --git a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md index 7d69b7a8be..d279ab3716 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md @@ -82,7 +82,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Llama-4-Scout-17B-16E-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. 
The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -91,7 +91,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -166,7 +166,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Default**: `TRTLLM` -See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`. +See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md index 46bf724b71..3ff4432d1b 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md @@ -61,7 +61,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Qwen3-Next model from within the container. ```shell -trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -70,7 +70,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -127,7 +127,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * `backend`: The backend to use for MoE operations. **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -220,7 +220,7 @@ If you want to save the results to a file add the following options. 
--result-filename "concurrency_${concurrency}.json" ``` -For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md index 894c6a1e63..bda3e1a4c4 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md @@ -66,7 +66,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Qwen3 model from within the container. ```shell -trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -75,7 +75,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -127,10 +127,10 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Options**: * `backend`: The backend to use for MoE operations. - + **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -247,7 +247,7 @@ If you want to save the results to a file add the following options. --result-filename "concurrency_${concurrency}.json" ``` -For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. diff --git a/docs/source/deployment-guide/index.rst b/docs/source/deployment-guide/index.rst index 644a9d9ae9..1d2df5e5b6 100644 --- a/docs/source/deployment-guide/index.rst +++ b/docs/source/deployment-guide/index.rst @@ -17,7 +17,7 @@ The TensorRT LLM Docker container makes these config files available at ``/app/t export TRTLLM_DIR="/app/tensorrt_llm" # path to the TensorRT LLM repo in your local environment -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. 
start-note-quick-start-isl-osl :end-before: .. end-note-quick-start-isl-osl @@ -36,52 +36,52 @@ This table is designed to provide a straightforward starting point; for detailed - H100, H200 - Max Throughput - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 `_ - B200, GB200 - Max Throughput - `deepseek-r1-deepgemm.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Max Throughput - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Min Latency - `deepseek-r1-latency.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` * - `gpt-oss-120b `_ - Any - Max Throughput - `gpt-oss-120b-throughput.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` * - `gpt-oss-120b `_ - Any - Min Latency - `gpt-oss-120b-latency.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` * - `Qwen3-Next-80B-A3B-Thinking `_ - Any - Max Throughput - `qwen3-next.yaml `_ - - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` + - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --config ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` * - Qwen3 family (e.g. 
`Qwen3-30B-A3B `_) - Any - Max Throughput - `qwen3.yaml `_ - - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) + - ``trtllm-serve Qwen/Qwen3-30B-A3B --config ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) * - `Llama-3.3-70B (FP8) `_ - Any - Max Throughput - `llama-3.3-70b.yaml `_ - - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` + - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --config ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` * - `Llama 4 Scout (FP8) `_ - Any - Max Throughput - `llama-4-scout.yaml `_ - - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` + - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --config ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` Model-Specific Deployment Guides --------------------------------- diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index ab6feab7e3..e95e28c496 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md @@ -2,6 +2,13 @@ # TensorRT LLM Benchmarking + +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + TensorRT LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility that aims to make it easier for users to reproduce our officially published [performance overview](./perf-overview.md#throughput-measurements). `trtllm-bench` provides the follows: @@ -176,7 +183,7 @@ trtllm-bench --model meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synth To benchmark the PyTorch backend (`tensorrt_llm._torch`), use the following command with [dataset](#preparing-a-dataset) generated from previous steps. The `throughput` benchmark initializes the backend by tuning against the dataset provided via `--dataset` (or the other build mode settings described above). -Note that CUDA graph is enabled by default. You can add additional pytorch config with `--extra_llm_api_options` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. +Note that CUDA graph is enabled by default. You can add additional pytorch config with `--config` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. ```{tip} The command below specifies the `--model_path` option. The model path is optional and used only when you want to run a locally @@ -289,7 +296,7 @@ The generated dataset will include LoRA request metadata. 
Below is an example of **LoRA Configuration** -Create an `extra-llm-api-options.yaml` file with LoRA configuration: +Create a `config.yaml` file with LoRA configuration: ```yaml lora_config: @@ -314,7 +321,7 @@ trtllm-bench --model /path/to/base/model \ throughput \ --dataset synthetic_lora_data.json \ --backend pytorch \ - --extra_llm_api_options extra-llm-api-options.yaml + --config config.yaml ``` ```{note} diff --git a/docs/source/developer-guide/perf-overview.md b/docs/source/developer-guide/perf-overview.md index ae3a0072e9..8602ff1896 100644 --- a/docs/source/developer-guide/perf-overview.md +++ b/docs/source/developer-guide/perf-overview.md @@ -269,7 +269,7 @@ Testing was performed using the PyTorch backend - this workflow does not require | Stage | Description | Command | | :- | - | - | | [Dataset](#preparing-a-dataset) | Create a synthetic dataset | `python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file` | -| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options` | +| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options` | ### Variables @@ -323,7 +323,7 @@ a model name (HuggingFace reference or path to a local model), a [generated data For dense / non-MoE models: ```shell -trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options ``` Llama 3.3 @@ -337,7 +337,7 @@ cuda_graph_config: For MoE models: ```shell -trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options ``` GPT-OSS: diff --git a/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md index d5e0cde8f2..84f8015889 100644 --- a/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md +++ b/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md @@ -24,7 +24,13 @@ As in the PyTorch workflow, AutoDeploy does not require a separate `trtllm-bench ## Advanced Configuration -For more granular control over AutoDeploy's behavior during benchmarking, use the `--extra_llm_api_options` flag with a YAML configuration file: +For more granular control over AutoDeploy's behavior during benchmarking, use the `--config` flag with a YAML configuration file: + +```{eval-rst} +.. include:: ../../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. 
end-note-config-flag-alias +``` ```bash trtllm-bench \ @@ -32,7 +38,7 @@ trtllm-bench \ throughput \ --dataset /tmp/synthetic_128_128.txt \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` ### Configuration Examples diff --git a/docs/source/features/disagg-serving.md b/docs/source/features/disagg-serving.md index ce52b9a3d5..b6eb4b17f9 100644 --- a/docs/source/features/disagg-serving.md +++ b/docs/source/features/disagg-serving.md @@ -1,4 +1,4 @@ -# Disaggregated Serving +# Disaggregated Serving - [Motivation](#Motivation) - [KV Cache Exchange](#KV-Cache-Exchange) @@ -100,6 +100,12 @@ For more information on how to use Dynamo with TensorRT-LLM, please refer to [th The second approach to evaluate disaggregated LLM inference with TensorRT LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 6 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 6). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request. +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` +
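Before looking at the launch commands in the next example, it can help to see how the orchestrator is wired to the workers. The disaggregated server reads a small YAML file listing the context and generation endpoints. The sketch below is illustrative only: the field names follow the layout used in `examples/disaggregated`, and the hostnames and ports are assumptions that must match however the underlying servers are launched.

```bash
# Illustrative sketch only: field names mirror examples/disaggregated and may
# differ between TensorRT LLM versions; hosts and ports are placeholders.
cat > disagg_config.yaml <<EOF
hostname: localhost
port: 8000
context_servers:
  num_instances: 2
  urls:
    - "localhost:8001"
    - "localhost:8002"
generation_servers:
  num_instances: 1
  urls:
    - "localhost:8003"
EOF

# The orchestrator is then started against this file, e.g.:
# trtllm-serve disaggregated -c disagg_config.yaml
```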
@@ -126,19 +132,19 @@ For example, you could launch two context servers and one generation servers as ``` -# Generate context_extra-llm-api-config.yml +# Generate context_config.yml # Overlap scheduler for context servers are disabled because it's not supported for disaggregated context servers yet -echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_extra-llm-api-config.yml +echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_config.yml # Start Context servers -CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_0 & -CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_1 & +CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --config ./context_config.yml &> log_ctx_0 & +CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --config ./context_config.yml &> log_ctx_1 & -# Generate gen_extra-llm-api-config.yml -echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_extra-llm-api-config.yml +# Generate gen_config.yml +echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_config.yml # Start Generation servers -CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --extra_llm_api_options ./gen_extra-llm-api-config.yml &> log_gen_0 & +CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --config ./gen_config.yml &> log_gen_0 & ``` Once the context and generation servers are launched, you can launch the disaggregated server, which will accept requests from clients and do the orchestration between context diff --git a/docs/source/features/guided-decoding.md b/docs/source/features/guided-decoding.md index 110efc8e51..3591d1808f 100644 --- a/docs/source/features/guided-decoding.md +++ b/docs/source/features/guided-decoding.md @@ -9,14 +9,20 @@ TensorRT LLM supports two grammar backends: ## Online API: `trtllm-serve` -If you are using `trtllm-serve`, enable guided decoding by specifying `guided_decoding_backend` with `xgrammar` or `llguidance` in the YAML configuration file, and pass it to `--extra_llm_api_options`. For example, +If you are using `trtllm-serve`, enable guided decoding by specifying `guided_decoding_backend` with `xgrammar` or `llguidance` in the YAML configuration file, and pass it to `--config`. For example, + +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` ```bash -cat > extra_llm_api_options.yaml < config.yaml < @@ -158,21 +164,21 @@ For the op outside of attention and MLP, the developer should obey the torch.com

Figure 2. TensorRT LLM Custom torch.compile Backend Overview

-Above is the overview of the TensorRT LLM custom backend for `torch.compile`. +Above is the overview of the TensorRT LLM custom backend for `torch.compile`. #### Torch IR Optimization Torch IR is the Fx graph that is directly traced by Torch Dynamo. It has several important features for us to do some graph rewriting and get information: 1. Preserve the operations as is: We can easily find a specific operation and then transform it to arbitrary operations. No need to deal with `auto_functionalize`, etc. -2. Preserve original variable tensor name in the Fx graph: For Piecewise CUDA Graph, it needs to find the correct `SymInt` which represents the token number. Hence, we rely on the `input_ids`'s shape to make it find the `SymInt` correctly. +2. Preserve original variable tensor name in the Fx graph: For Piecewise CUDA Graph, it needs to find the correct `SymInt` which represents the token number. Hence, we rely on the `input_ids`'s shape to make it find the `SymInt` correctly. #### ATen IR Optimization We get ATen IR after explicitly calling `aot_module_simplified` on the Fx graph. ATen IR is 1. In SSA format (no input mutations) -2. Strict subset of aten op (<250): In Torch IR, Python native add op, `torch.Tensor().add()`, `torch.aten.add.Tensor` could be three different ops. After the transform, they will be the same op. +2. Strict subset of aten op (<250): In Torch IR, Python native add op, `torch.Tensor().add()`, `torch.aten.add.Tensor` could be three different ops. After the transform, they will be the same op. 3. Guaranteed metadata information, e.g., dtype and shape propagation On this IR level, TensorRT LLM will do the following optimization @@ -183,16 +189,16 @@ All fusions are located in `tensorrt_llm/_torch/compilation/patterns` and implem 1. Inadequate handling of scalars and lists: - Scalars get specialized into the traced pattern, forcing one pattern per value—impractical and non-general. - - Lists are flattened, turning elements into separate input arguments, making it impossible to match the original operation. + - Lists are flattened, turning elements into separate input arguments, making it impossible to match the original operation. 2. Trace-driven pitfalls: Because it’s trace-based, the generated source patterns may not meet our needs and can introduce additional issues as we expand pattern coverage. We mainly do the operation fusion for AllReduce & RMSNorm. 1. AllReduce related fusion: Fuse the following operations into one AllReduce op. + AllReduce + Residual + RMSNorm - + AllReduce + Residual + RMSNorm + FP8 Quantization + + AllReduce + Residual + RMSNorm + FP8 Quantization + AllReduce + Residual + RMSNorm + FP4 Quantization -2. AllReduce with User Buffer: Converts AllReduce operations to use userbuffers to avoid extra copy overhead. +2. AllReduce with User Buffer: Converts AllReduce operations to use userbuffers to avoid extra copy overhead. We enable these fusions in torch.compile because they’re difficult to express in eager mode. For the AllReduce + RMSNorm fusion, which is cross-module, implementing it in eager mode would require moving code between modules, leading to redundant, complex, and hard-to-maintain logic. @@ -204,7 +210,7 @@ Because ATen IR is SSA, in-place operations are rewritten as out-of-place via a ##### Auto Multi-stream -Currently torch.compile won't create a subgraph for user user-defined CUDA stream. Instead, it will convert it to `set_stream`. 
The set_stream op doesn't have any consumers, so it will be removed in the Torch IR to ATen IR transformation, thus losing all the multi-stream scheduling. +Currently torch.compile won't create a subgraph for user user-defined CUDA stream. Instead, it will convert it to `set_stream`. The set_stream op doesn't have any consumers, so it will be removed in the Torch IR to ATen IR transformation, thus losing all the multi-stream scheduling. To address this, we implemented an auto multi-stream scheduler: @@ -214,7 +220,7 @@ To address this, we implemented an auto multi-stream scheduler: 3. Schedules nodes onto up to `max_num_streams` specified by user config -4. Insert multi-stream related custom op: since the Fx graph executes operators in list order, so we insert streaming-control operators directly into the graph. Moreover, as these operators have no users, we cannot perform dead-code elimination after multi-stream scheduling. Below is an example of multi-stream, which `trtllm.dsv3_router_gemm_op.default` and `trtllm.silu_and_mul.default` + `trtllm.fp4_quantize.default` execute in parallel. +4. Insert multi-stream related custom op: since the Fx graph executes operators in list order, so we insert streaming-control operators directly into the graph. Moreover, as these operators have no users, we cannot perform dead-code elimination after multi-stream scheduling. Below is an example of multi-stream, which `trtllm.dsv3_router_gemm_op.default` and `trtllm.silu_and_mul.default` + `trtllm.fp4_quantize.default` execute in parallel. ``` call_function record_event trtllm.record_event (1,) {} @@ -238,7 +244,7 @@ To address this, we implemented an auto multi-stream scheduler: call_function record_stream_1 trtllm.record_stream (mm_1, 1) {} call_function record_event_4 trtllm.record_event (2,) {} call_function set_stream_1 trtllm.set_stream (0,) {} - call_function wait_event_2 trtllm.wait_event (2,) + call_function wait_event_2 trtllm.wait_event (2,) ``` #### Piecewise CUDA Graph @@ -254,14 +260,14 @@ In the current design, we assume the attention block is the only non-capturable Notes: -1. Attention **MUST NOT** have any output. The output tensor should be allocated by CUDA Graph. -2. Each sub-cudagraph **MUST** have at least one input tensor that contains the number of tokens in the shape. -3. Only allow dynamic shape for `num_of_tokens` dim. +1. Attention **MUST NOT** have any output. The output tensor should be allocated by CUDA Graph. +2. Each sub-cudagraph **MUST** have at least one input tensor that contains the number of tokens in the shape. +3. Only allow dynamic shape for `num_of_tokens` dim. ### Common Trace Failure 1. Custom op fake kernel: For every custom op, developers must implement a correct fake kernel. **Make sure to update the corresponding fake kernel when the custom op is changed** -2. Dynamic Iteration Number Loop: This is technically not a trace failure, but it will introduce long-time tracing that is generally not acceptable. When torch.compile tries to convert PyTorch modeling code to Fx graph, it will try to unroll the loop. For a loop that has a large and dynamic loop number with a large loop body, the tracing process will take a long time to do the unrolling. +2. Dynamic Iteration Number Loop: This is technically not a trace failure, but it will introduce long-time tracing that is generally not acceptable. When torch.compile tries to convert PyTorch modeling code to Fx graph, it will try to unroll the loop. 
For a loop that has a large and dynamic loop number with a large loop body, the tracing process will take a long time to do the unrolling. 1. If the IO of the loop can be easily written into a custom op format, try to replace it with a custom op 2. If the loop num is unchanged during the whole inference service lifetime, then it is ok to leave the loop as is. (e.g., Model decoder layer loop) @@ -276,30 +282,30 @@ Notes: + `torch.nonzeros()`: Produce data-dependent dynamic shape tensor + `torch.sym_min`: `SymInt` aware min + `torch.Tensor.tolist()`, `torch.Tensor.item()` - + **Solution:** Use them inside a custom op if these operators don't get involved in producing the custom op's output tensor. + + **Solution:** Use them inside a custom op if these operators don't get involved in producing the custom op's output tensor. -2. Use a custom object’s method: For a class like mapping config, we cannot directly use its method like has_pp() in the model forward. +2. Use a custom object’s method: For a class like mapping config, we cannot directly use its method like has_pp() in the model forward. - + **Solution**: We should convert it to a bool in the model init and use the bool. + + **Solution**: We should convert it to a bool in the model init and use the bool. ```python class Mapping(object): def __init__(self, ...): ... - + def has_pp(self): # Cannot use this method in torch.compile return self.pp_size > 1 ``` 3. Data Dependent Control(DDC) flow involved in code - + **Solution**: Try to avoid DDC in the code. Try to pre-compute the result outside of torch.compile's scope. For the following example, try to pre-compute the `torch.sum(data)` at the data preparation stage, and pass the result to the `forward`. + + **Solution**: Try to avoid DDC in the code. Try to pre-compute the result outside of torch.compile's scope. For the following example, try to pre-compute the `torch.sum(data)` at the data preparation stage, and pass the result to the `forward`. ```python class TestCase(torch.nn.Module): def __init__(self): super().__init__() - + def forward(self, x, data): y = x ** 2 if torch.sum(data) >= 4: # Data Dependent Control Here! @@ -308,7 +314,7 @@ Notes: t = y / 2 t = t + 10 return t - + test_case = TestCase() test_case = torch.compile(test_case, backend=Backend()) x = torch.randn(5).cuda() @@ -320,15 +326,15 @@ Notes: ### Recompilation -1. Try not to use data-dependent dynamic shapes in the model forward. (e.g., slice the tensor based on input value). This will introduce 0/1 specialization to the model and will possibly introduce recompile. +1. Try not to use data-dependent dynamic shapes in the model forward. (e.g., slice the tensor based on input value). This will introduce 0/1 specialization to the model and will possibly introduce recompile. 1. **0/1 specialization**: torch.compile will recompile the model if a dynamic tensor’s dim equals 0 or 1. In the worst case, it will recompile 3 times for 1 dimension: 0,1, >2 -2. For an int argument that would change during runtime, use `SymInt` rather than int in the C++ custom op definition. Otherwise, it will trigger a recompile when the value changes. +2. For an int argument that would change during runtime, use `SymInt` rather than int in the C++ custom op definition. Otherwise, it will trigger a recompile when the value changes. ```c++ TORCH_LIBRARY_FRAGMENT(trtllm, m) - { + { m.def("allgather(Tensor input, SymInt[]? sizes, int[] group) -> Tensor"); m.def("allgather_list(Tensor[] input_list, SymInt[]? 
sizes, int[] group) -> Tensor[]"); } @@ -340,13 +346,13 @@ Notes: 2. Control Flow based on dynamic shape - 3. Next power of two: Previously, we used `bit_length()` to implement the next power of 2 function. However, it will cause a recompile for every int value. Now rewrite the code to be torch.compile-friendly. + 3. Next power of two: Previously, we used `bit_length()` to implement the next power of 2 function. However, it will cause a recompile for every int value. Now rewrite the code to be torch.compile-friendly. ```python def next_positive_power_of_2(x: int) -> int: if x < 1: return 1 - + # Following code is equivalent to 1 << (x - 1).bit_length() # But this impl does not contain bit_length(), so it can be used by torch compile. # It can correctly handle 64-bit numbers, which should be enough for now. @@ -359,5 +365,3 @@ Notes: n |= n >> 32 return n + 1 ``` - - diff --git a/docs/source/helper.py b/docs/source/helper.py index 675bd697e9..9f6530e166 100644 --- a/docs/source/helper.py +++ b/docs/source/helper.py @@ -358,15 +358,20 @@ def update_version(): docs_source_dir = Path(__file__).parent.resolve() md_files = list(docs_source_dir.rglob("*.md")) + # Default is to replace `release:x.y.z` placeholders; set to 0 to disable. + if os.environ.get("TRTLLM_DOCS_REPLACE_CONTAINER_TAG", "1") != "1": + return + for file_path in md_files: with open(file_path, "r") as f: content = f.read() - content = content.replace( + updated = content.replace( "nvcr.io/nvidia/tensorrt-llm/release:x.y.z", f"nvcr.io/nvidia/tensorrt-llm/release:{version}", ) - with open(file_path, "w") as f: - f.write(content) + if updated != content: + with open(file_path, "w") as f: + f.write(updated) if __name__ == "__main__": diff --git a/docs/source/legacy/performance/perf-benchmarking.md b/docs/source/legacy/performance/perf-benchmarking.md index 9530b6da1b..caca11a7a4 100644 --- a/docs/source/legacy/performance/perf-benchmarking.md +++ b/docs/source/legacy/performance/perf-benchmarking.md @@ -415,11 +415,17 @@ Total Latency (ms): 13525.6862 ### Running with the PyTorch Workflow +```{eval-rst} +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + To benchmark the PyTorch backend (`tensorrt_llm._torch`), use the following command with [dataset](#preparing-a-dataset) generated from previous steps. With the PyTorch flow, you will not need to run `trtllm-bench build`; the `throughput` benchmark initializes the backend by tuning against the dataset provided via `--dataset` (or the other build mode settings described [above](#other-build-modes)). Note that CUDA graph is enabled by default. You can add additional pytorch config with -`--extra_llm_api_options` followed by the path to a YAML file. For more details, please refer to the +`--config` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. ```{tip} @@ -511,7 +517,7 @@ The generated dataset will include LoRA request metadata. 
Below is an example of **LoRA Configuration** -Create an `extra-llm-api-options.yaml` file with LoRA configuration: +Create a `config.yaml` file with LoRA configuration: ```yaml lora_config: @@ -535,7 +541,7 @@ lora_config: trtllm-bench --model /path/to/base/model \ throughput \ --dataset synthetic_lora_data.json \ - --extra_llm_api_options extra-llm-api-options.yaml + --config config.yaml ``` ```{note} diff --git a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md index 43e2a1a46e..2f37c716cf 100644 --- a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md +++ b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md @@ -24,7 +24,7 @@ As in the PyTorch workflow, AutoDeploy does not require a separate `trtllm-bench ## Advanced Configuration -For more granular control over AutoDeploy's behavior during benchmarking, use the `--extra_llm_api_options` flag with a YAML configuration file: +For more granular control over AutoDeploy's behavior during benchmarking, use the `--config` flag with a YAML configuration file: ```bash trtllm-bench \ @@ -32,7 +32,7 @@ trtllm-bench \ throughput \ --dataset /tmp/synthetic_128_128.txt \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` ### Configuration Examples diff --git a/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md b/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md index 6e52fe4ea4..20693f6170 100644 --- a/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md +++ b/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md @@ -30,13 +30,13 @@ curl -s http://localhost:8000/v1/chat/completions \ ## Configuration via YAML -Use `--extra_llm_api_options` to supply a YAML file that augments or overrides server/runtime settings. +Use `--config` to supply a YAML file that augments or overrides server/runtime settings. ```bash trtllm-serve \ meta-llama/Llama-3.1-8B \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` Example `autodeploy_config.yaml`: diff --git a/docs/source/torch/features/lora.md b/docs/source/torch/features/lora.md index d00a27d49a..ccf7561efb 100644 --- a/docs/source/torch/features/lora.md +++ b/docs/source/torch/features/lora.md @@ -157,7 +157,7 @@ llm = LLM( ### YAML Configuration -Create an `extra_llm_api_options.yaml` file: +Create a `config.yaml` file: ```yaml lora_config: @@ -170,7 +170,7 @@ lora_config: ```bash python -m tensorrt_llm.commands.serve /path/to/model \ - --extra_llm_api_options extra_llm_api_options.yaml + --config config.yaml ``` ### Client Usage @@ -198,7 +198,7 @@ response = client.completions.create( ### YAML Configuration -Create an `extra_llm_api_options.yaml` file: +Create a `config.yaml` file: ```yaml lora_config: @@ -220,5 +220,5 @@ lora_config: ### Run trtllm-bench ```bash -trtllm-bench --model $model_path throughput --dataset $dataset_path --extra_llm_api_options extra-llm-api-options.yaml --num_requests 64 --concurrency 16 +trtllm-bench --model $model_path throughput --dataset $dataset_path --config config.yaml --num_requests 64 --concurrency 16 ``` diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/configs/README.md b/examples/configs/README.md index b9a47281d2..dc633c8b2c 100644 --- a/examples/configs/README.md +++ b/examples/configs/README.md @@ -1,5 +1,5 @@ # Recommended LLM API Configuration Settings -This directory contains recommended [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/) performance settings for popular models. They can be used out-of-the-box with `trtllm-serve` via the `--extra_llm_api_options` CLI flag, or you can adjust them to your specific use case. +This directory contains recommended [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/) performance settings for popular models. They can be used out-of-the-box with `trtllm-serve` via the `--config` CLI flag, or you can adjust them to your specific use case. For model-specific deployment guides, please refer to the [official documentation](https://nvidia.github.io/TensorRT-LLM/deployment-guide/index.html). diff --git a/examples/configs/__init__.py b/examples/configs/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/configs/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/configs/database/__init__.py b/examples/configs/database/__init__.py new file mode 100644 index 0000000000..3159bfe656 --- /dev/null +++ b/examples/configs/database/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
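The curated configuration files referenced above are intended to be usable as-is with the `--config` flag, or copied and tuned for a specific workload. A minimal sketch, assuming the container layout used earlier in this guide (`TRTLLM_DIR=/app/tensorrt_llm`) and the curated `qwen3.yaml` that the deployment-guide table points at:

```bash
# Sketch only: paths assume the TensorRT LLM container layout referenced earlier.
export TRTLLM_DIR="/app/tensorrt_llm"

# Use a curated config as-is ...
trtllm-serve Qwen/Qwen3-30B-A3B --config ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml

# ... or copy it and adjust it for your own use case before serving.
cp ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml ./my-qwen3.yaml
trtllm-serve Qwen/Qwen3-30B-A3B --config ./my-qwen3.yaml
```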
diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 8b99f8845f..64dd80cbdf 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -23,10 +23,10 @@ cache_transceiver_config: kv_transfer_sender_future_timeout_ms: ``` -The following is an example, consisting of the `ctx_extra-llm-api-config.yaml` and `gen_extra-llm-api-config.yaml` files needed in the sections below. +The following is an example, consisting of the `ctx_config.yaml` and `gen_config.yaml` files needed in the sections below. ```yaml -# ctx_extra-llm-api-config.yaml +# ctx_config.yaml # The overlap scheduler for context servers is currently disabled, as it is # not yet supported in disaggregated context server architectures. @@ -37,7 +37,7 @@ cache_transceiver_config: ``` ```yaml -# gen_extra-llm-api-config.yaml +# gen_config.yaml cache_transceiver_config: backend: UCX @@ -54,16 +54,16 @@ Suppose we have three CUDA devices on the same machine. The first two devices ar # Start context servers CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8001 \ - --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_0 & + --config ./ctx_config.yaml &> log_ctx_0 & CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8002 \ - --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_1 & + --config ./ctx_config.yaml &> log_ctx_1 & # Start generation server CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8003 \ - --extra_llm_api_options ./gen_extra-llm-api-config.yaml &> log_gen_0 & + --config ./gen_config.yaml &> log_gen_0 & ``` Once the context and generation servers are launched, you can launch the disaggregated @@ -131,16 +131,16 @@ After starting the node and entering interactive mode, you can run the following # Start context servers CUDA_VISIBLE_DEVICES=0 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8001 \ - --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_0 & + --config ./ctx_config.yaml &> log_ctx_0 & CUDA_VISIBLE_DEVICES=1 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8002 \ - --extra_llm_api_options ./ctx_extra-llm-api-config.yaml &> log_ctx_1 & + --config ./ctx_config.yaml &> log_ctx_1 & # Start generation server CUDA_VISIBLE_DEVICES=2 trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --host localhost --port 8003 \ - --extra_llm_api_options ./gen_extra-llm-api-config.yaml &> log_gen_0 & + --config ./gen_config.yaml &> log_gen_0 & # Start proxy trtllm-llmapi-launch trtllm-serve disaggregated -c disagg_config.yaml @@ -182,7 +182,7 @@ srun -A -p -t