Merge branch 'main' into update_mnnvl_test

commit 6b6daf31a8 by Bo Li, 2025-12-17 01:57:20 +08:00 (committed by GitHub)
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
3346 changed files with 66356 additions and 24893 deletions

.gitattributes

@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text
cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text

View File

@ -0,0 +1,127 @@
name: Manage Waiting for Feedback Label
on:
issue_comment:
types: [created]
pull_request_review_comment:
types: [created]
permissions:
issues: write
pull-requests: write
jobs:
manage-waiting-for-feedback:
runs-on: ubuntu-latest
if: github.repository == 'NVIDIA/TensorRT-LLM'
steps:
- name: Check membership and manage label
uses: actions/github-script@v8
with:
script: |
const commenter = context.payload.comment.user.login;
const commenterType = context.payload.comment.user.type;
const label = 'waiting for feedback';
// Ignore bots and CI accounts
const ignoredAccounts = ['tensorrt-cicd'];
if (commenterType === 'Bot' || ignoredAccounts.includes(commenter)) {
console.log(`Ignoring comment from ${commenter} (type: ${commenterType}). Skipping.`);
return;
}
// Handle both issue_comment and pull_request_review_comment events
// context.issue.number is only available for issue_comment events
const issueNumber = context.issue?.number || context.payload.pull_request?.number;
const issue = context.payload.issue || context.payload.pull_request;
const author = issue?.user?.login;
const isAuthor = (commenter === author);
if (!issueNumber) {
console.log('Could not determine issue/PR number. Skipping.');
return;
}
console.log(`Comment by ${commenter} on #${issueNumber} (author: ${author})`);
const owner = context.repo.owner;
const repo = context.repo.repo;
// Check if commenter is repository member
let isMember = false;
try {
await github.rest.repos.checkCollaborator({
owner,
repo,
username: commenter
});
isMember = true;
} catch (error) {
if (error.status === 404) {
isMember = false;
} else if (error.status === 302) {
console.log(`Cannot determine membership for ${commenter} (insufficient token permissions)`);
return;
} else {
console.error(`Error checking membership: ${error.message}`);
throw error;
}
}
// Logic:
// - Author responds → remove label (feedback provided)
// - NVIDIA non-author comments → add label (team is waiting for response)
// - External non-author comments → remove label (someone provided feedback)
if (isAuthor) {
// Author responded - remove 'waiting for feedback' label
console.log(`${commenter} is the author. Removing '${label}' label if present.`);
try {
await github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issueNumber,
name: label
});
console.log(`Successfully removed '${label}' label from #${issueNumber}`);
} catch (error) {
if (error.status === 404) {
console.log(`Label '${label}' was not present on #${issueNumber}. No action needed.`);
} else {
throw error;
}
}
} else if (isMember) {
// NVIDIA non-author commented - add 'waiting for feedback' label
console.log(`${commenter} is an NVIDIA member (not author). Adding '${label}' label.`);
await github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issueNumber,
labels: [label]
});
console.log(`Successfully added '${label}' label to #${issueNumber}`);
} else {
// External non-author commented - remove 'waiting for feedback' label
console.log(`${commenter} is external (not author). Removing '${label}' label if present.`);
try {
await github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issueNumber,
name: label
});
console.log(`Successfully removed '${label}' label from #${issueNumber}`);
} catch (error) {
if (error.status === 404) {
console.log(`Label '${label}' was not present on #${issueNumber}. No action needed.`);
} else {
throw error;
}
}
}

.gitignore

@ -55,6 +55,8 @@ tensorrt_llm/scripts
*docs/source/_cpp_gen*
docs/source/**/*.rst
!docs/source/examples/index.rst
!docs/source/deployment-guide/config_table.rst
!docs/source/deployment-guide/note_sections.rst
*.swp
# Testing
@ -72,6 +74,7 @@ llm-test-workspace/
cpp/include/tensorrt_llm/executor/version.h
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
.devcontainer/.env
/examples/layer_wise_benchmarks/profiles/
@ -86,3 +89,6 @@ compile_commands.json
# Enroot sqsh files
enroot/sw-tensorrt-docker+*.sqsh
enroot/tensorrt_llm.devel.sqsh
# MacOSX Files
.DS_Store

View File

@ -1395,6 +1395,8 @@ repos:
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
exclude: |
(?x)^(.*cubin.cpp | .*cubin.h)$
- id: check-yaml
args: [--allow-multiple-documents]
exclude: ".*/gitlab/.*.yml"
@ -1439,7 +1441,7 @@ repos:
additional_dependencies:
- tomli
# add ignore words list
args: ["-L", "Mor,ans,thirdparty", "--skip", "ATTRIBUTIONS-*.md,*.svg", "--skip", "security_scanning/*"]
args: ["-L", "Mor,ans,thirdparty,subtiles", "--skip", "ATTRIBUTIONS-*.md,*.svg", "--skip", "security_scanning/*"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
hooks:

File diff suppressed because it is too large.

View File

@ -8,33 +8,9 @@
## Coding Guidelines
* Coding style for TensorRT-LLM can be found [in this document](CODING_GUIDELINES.md).
TensorRT-LLM Coding Style can be found [in this document](CODING_GUIDELINES.md).
* All contributed C++ code should be formatted following the rules in TensorRT-LLM's [clang-format](.clang-format) file. The recommended version is clang-format>=14.0.
* Changes can be formatted with the following command:
```bash
# Commit ID is optional - if unspecified, run format on staged changes.
git-clang-format --style file [commit ID/reference]
```
* All contributed Python code should be formatted using the `black` Python package. The recommended version is `black>=23.0`
* Changes can be formatted with the following command:
```bash
git diff --name-only | grep "*.py" | xargs black -l 120
```
* Try to keep pull requests (PRs) as concise as possible:
* Avoid committing commented-out code.
* Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
## Coding Style
We use `pre-commit` for automatic code formatting and validation. Install the `pre-commit` package in your local
Python environment.
We use `pre-commit` for automatic code formatting and validation. Install the `pre-commit` package in your local Python environment.
```bash
pip install pre-commit
@ -73,6 +49,9 @@ mdformat.................................................................Passed
If any files were modified by this hook, you will need to stage and commit them again.
In addition, please try to keep pull requests (PRs) as concise as possible:
* Avoid committing commented-out code.
* Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
## Pull Requests

LICENSE

@ -1,7 +1,84 @@
Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Portions of this project are under the following copyright:
- Copyright contributors to the vLLM project
This project is licensed under the Apache 2.0 license, whose full license text is available below.
This project contains portions of code that are based on or derived from
other open source projects, which may have different licenses whose text
is available below.
All modifications and additions to other projects are licensed under the
Apache License 2.0 unless otherwise specified. Please refer to the individual
file headers for specific copyright and license information.
Below is a list of other projects that have portions contained by this project:
--------------------------------------------------------------------------------
causal-conv1d
--------------------------------------------------------------------------------
Original Source: https://github.com/Dao-AILab/causal-conv1d
Copyright (c) 2024, Tri Dao.
Licensed under the BSD 3-Clause License
--------------------------------------------------------------------------------
flash-linear-attention
--------------------------------------------------------------------------------
Original Source: https://github.com/fla-org/flash-linear-attention
Copyright (c) 2023-2025 Songlin Yang
Licensed under the MIT License
--------------------------------------------------------------------------------
InstructEval
--------------------------------------------------------------------------------
Original Source: https://github.com/declare-lab/instruct-eval
Copyright (c) 2020 Dan Hendrycks
Copyright (c) 2023 Deep Cognition and Language Research (DeCLaRe) Lab
Licensed under the MIT License
--------------------------------------------------------------------------------
Mamba
--------------------------------------------------------------------------------
Original Source: https://github.com/state-spaces/mamba
Copyright 2023 Tri Dao, Albert Gu
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
SGLang
--------------------------------------------------------------------------------
Original Source: https://github.com/sgl-project/sglang
Copyright contributors to the SGLang project
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
Text Generation Inference
--------------------------------------------------------------------------------
Original Source: https://github.com/huggingface/text-generation-inference
Copyright 2022 Hugging Face
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
Transformers
--------------------------------------------------------------------------------
Original Source: https://github.com/huggingface/transformers
Copyright 2018 The HuggingFace Team
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
XGrammar
--------------------------------------------------------------------------------
Original Source: https://github.com/mlc-ai/xgrammar
Copyright (c) 2024 by XGrammar Contributors
Licensed under the Apache License 2.0
--------------------------------------------------------------------------------
vLLM
--------------------------------------------------------------------------------
Original Source: https://github.com/vllm-project/vllm
Copyright contributors to the vLLM project
Licensed under the Apache License 2.0
================================================================================
Apache 2.0 LICENSE
================================================================================
Apache License
Version 2.0, January 2004
@ -204,3 +281,54 @@ Portions of this project are under the following copyright:
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================================================
MIT LICENSE
================================================================================
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================================================
BSD 3-Clause License
================================================================================
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -10,7 +10,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.<
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads)
[![torch](https://img.shields.io/badge/torch-2.9.0-green)](https://pytorch.org)
[![version](https://img.shields.io/badge/release-1.2.0rc5-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
[![version](https://img.shields.io/badge/release-1.2.0rc6-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
[Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)

View File

@ -49,7 +49,7 @@ class RootArgs(BaseModel):
return self
@click.group()
@click.group(deprecated=True)
@click.option(
"--tokenizer",
required=True,

View File

@ -1,6 +1,7 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
*AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@ -17,13 +18,16 @@
*/
#include "utils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include <random>
#include <filesystem>
#include <fstream>
namespace tensorrt_llm::benchmark
TRTLLM_NAMESPACE_BEGIN
namespace benchmark
{
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
@ -98,7 +102,8 @@ Samples parseWorkloadJson(
if (samples.size() < maxNumSamples)
{
TLLM_LOG_WARNING(
"Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
"Dataset size %zu is smaller than given max_num_samples "
"%d, max_num_samples will be ignored.\n",
samples.size(), maxNumSamples);
}
return samples;
@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric)
return os;
}
} // namespace tensorrt_llm::benchmark
} // namespace benchmark
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/executor/executor.h"
#include <cstdint>
@ -29,7 +30,9 @@
#pragma once
namespace tensorrt_llm::benchmark
TRTLLM_NAMESPACE_BEGIN
namespace benchmark
{
// using namespace tensorrt_llm::batch_manager;
@ -237,4 +240,6 @@ std::vector<double> generateRandomExponentialValues(int count, float lambda, int
std::vector<double> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);
} // namespace tensorrt_llm::benchmark
} // namespace benchmark
TRTLLM_NAMESPACE_END

View File

@ -1,2 +1,5 @@
# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image
# is updated.
# WAR against https://github.com/advisories/GHSA-gm62-xv2j-4w53
# WAR against https://github.com/advisories/GHSA-2xpw-w6gg-jr37
urllib3>=2.6.0

View File

@ -78,9 +78,7 @@ using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType;
using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
using CacheSaltIDType = tensorrt_llm::runtime::CacheSaltIDType;
// Type alias for multimodal hash key (hash array + start offset)
using MmKey = std::pair<std::array<uint8_t, 32>, SizeType32>;
using MmKey = tensorrt_llm::executor::MmKey;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
@ -325,6 +323,8 @@ public:
size_t getHash() const;
std::vector<MmKey> getExtraKeys() const;
private:
// Linear ID of block independent of pool
IdType mBlockId;

View File

@ -16,8 +16,9 @@
#pragma once
namespace tensorrt_llm
{
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
// Base class for algorithms
struct Algorithm
@ -29,4 +30,4 @@ struct Algorithm
Algorithm& operator=(Algorithm const&) = delete;
};
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,9 +17,13 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cstdint>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
//!
@ -100,4 +104,6 @@ private:
size_type mSize;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,14 +16,19 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
TRTLLM_NAMESPACE_BEGIN
class DebugConfig
{
public:
static bool isCheckDebugEnabled();
};
TRTLLM_NAMESPACE_END
#if defined(_WIN32)
#define TLLM_LIKELY(x) (__assume((x) == 1), (x))
#define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
@ -35,8 +40,8 @@ public:
#define TLLM_CHECK(val) \
do \
{ \
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
} while (0)
#define TLLM_CHECK_WITH_INFO(val, info, ...) \
@ -51,17 +56,17 @@ public:
#define TLLM_CHECK_DEBUG(val) \
do \
{ \
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
{ \
TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
: tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
} \
} while (0)
#define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
do \
{ \
if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
{ \
TLLM_LIKELY(static_cast<bool>(val)) \
? ((void) 0) \
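For orientation, a minimal usage sketch of the check macros above; the function, the values, and the `assert.h` include path are illustrative assumptions, not part of this diff:

```cpp
#include "tensorrt_llm/common/assert.h" // assumed home of the TLLM_CHECK* macros

void copyRow(int idx, int numRows)
{
    // Always evaluated; failure throws via tensorrt_llm::common::throwRuntimeError.
    TLLM_CHECK(numRows > 0);
    // Evaluated only when tensorrt_llm::DebugConfig::isCheckDebugEnabled() returns true,
    // which is why the macro now spells out the fully qualified DebugConfig name.
    TLLM_CHECK_DEBUG(idx < numRows);
}
```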

View File

@ -17,9 +17,13 @@
#pragma once
#include "c10/util/intrusive_ptr.h"
#include "tensorrt_llm/common/config.h"
#include <Python.h>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// Adapted from pybind11's example implementation:
@ -69,4 +73,6 @@ c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a
return c10::intrusive_ptr<T>::reclaim_copy(p);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -0,0 +1,62 @@
/*
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef TRTLLM_CONFIG_H
#define TRTLLM_CONFIG_H
/**
* \def TRTLLM_ABI_NAMESPACE
* This macro is used to open an implicitly inline namespace block for the ABI version.
* This macro can be overridden to change the ABI version.
* The default ABI version is _v1.
*/
#ifndef TRTLLM_ABI_NAMESPACE
#define TRTLLM_ABI_NAMESPACE _v1
#endif
#ifndef TRTLLM_ABI_NAMESPACE_BEGIN
#define TRTLLM_ABI_NAMESPACE_BEGIN \
inline namespace TRTLLM_ABI_NAMESPACE \
{
#endif
#ifndef TRTLLM_ABI_NAMESPACE_END
#define TRTLLM_ABI_NAMESPACE_END }
#endif
/**
* \def TRTLLM_NAMESPACE_BEGIN
* This macro is used to open a `tensorrt_llm::` namespace block, along with any
* enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
* This macro is defined by TensorRT-LLM and may not be overridden.
*/
#define TRTLLM_NAMESPACE_BEGIN \
namespace tensorrt_llm \
{ \
TRTLLM_ABI_NAMESPACE_BEGIN
/**
* \def TRTLLM_NAMESPACE_END
* This macro is used to close a `tensorrt_llm::` namespace block, along with any
* enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
* This macro is defined by TensorRT-LLM and may not be overridden.
*/
#define TRTLLM_NAMESPACE_END \
TRTLLM_ABI_NAMESPACE_END \
} /* end namespace tensorrt_llm */
#endif // TRTLLM_CONFIG_H
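The net effect is that headers can swap their explicit `namespace tensorrt_llm { ... }` blocks for the ABI-versioned form, which is the pattern the other header changes in this commit follow. A minimal sketch (the `answer` declaration is a placeholder):

```cpp
#include "tensorrt_llm/common/config.h"

TRTLLM_NAMESPACE_BEGIN // expands to: namespace tensorrt_llm { inline namespace _v1 {
namespace common
{

// Because _v1 is an inline namespace, this stays reachable as tensorrt_llm::common::answer().
int answer();

} // namespace common
TRTLLM_NAMESPACE_END // closes both _v1 and tensorrt_llm
```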

View File

@ -16,6 +16,8 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#include <cuda_runtime.h>
@ -29,8 +31,8 @@
#define USE_QGMMA
#endif
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T
const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
#endif // ENABLE_FP8

View File

@ -14,12 +14,18 @@
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_set>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Populate the start and end profiling iteration indexes from the provided environment variables
@ -28,4 +34,6 @@ namespace tensorrt_llm::common
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName = std::nullopt);
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaDriverWrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
@ -49,7 +50,9 @@
// this undef.
#endif // WIN32
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// workspace for cublas gemm : 32MB
@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq)
DEFINE_MEMBER_CHECKER(qua)
DEFINE_MEMBER_CHECKER(high_preciecion_normed_output)
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
/*
* Macros compliant with TensorRT coding conventions

View File

@ -16,11 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
#include <NvInferRuntime.h>
#include <map>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
constexpr static size_t getDTypeSize(nvinfer1::DataType type)
@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type)
return "";
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -22,9 +22,12 @@
#include <string>
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
class Logger
@ -125,12 +128,12 @@ private:
static inline std::string getPrefix(Level const level)
{
return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
}
static inline std::string getPrefix(Level const level, int const rank)
{
return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
}
};
@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
out << std::endl;
}
}
} // namespace common
TRTLLM_NAMESPACE_END
#define TLLM_LOG(level, ...) \
do \
@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
#define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
#define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
#define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
} // namespace tensorrt_llm::common
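A short sketch of how the logging macros defined here are called elsewhere in this commit (compare the TLLM_LOG_WARNING call in the benchmark utils diff above); the function and message are illustrative:

```cpp
#include <cstddef>

#include "tensorrt_llm/common/logger.h"

void reportDatasetSize(std::size_t datasetSize, int maxNumSamples)
{
    if (datasetSize < static_cast<std::size_t>(maxNumSamples))
    {
        // printf-style formatting, routed through tensorrt_llm::common::Logger
        TLLM_LOG_WARNING("Dataset size %zu is smaller than max_num_samples %d.", datasetSize, maxNumSamples);
    }
}
```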

View File

@ -16,11 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <functional>
#include <memory>
#include <optional>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/**
@ -100,4 +104,6 @@ public:
}
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,14 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <optional>
#include <string>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -480,4 +482,5 @@ public:
};
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#if ENABLE_BF16
#include <cuda_bf16.h>
#endif // ENABLE_BF16
@ -28,7 +29,9 @@
#include <unordered_set>
#include <vector>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
#if ENABLE_BF16
static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
@ -228,4 +231,6 @@ inline void toUpper(std::string& s)
}
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <array>
@ -41,7 +42,9 @@
tensorrt_llm::common::RequestSpecificException( \
__FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Enumeration of different error codes for request-specific exceptions
@ -77,7 +80,8 @@ private:
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info)
{
throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
throw TllmException(
file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
}
[[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
@ -102,4 +106,6 @@ private:
RequestErrorCode mErrorCode;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,8 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <algorithm>
#include <initializer_list>
#include <string>
@ -24,7 +26,9 @@
#include <pthread.h>
#endif
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
inline bool setThreadName(std::string const& name)
@ -43,4 +47,6 @@ bool contains(std::initializer_list<T> const& c, T const& v)
return std::find(c.begin(), c.end(), v) != c.end();
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -66,6 +66,7 @@ public:
[[nodiscard]] virtual std::vector<Connection const*> getConnections(CommState const& state) = 0;
[[nodiscard]] virtual CommState const& getCommState() const = 0;
[[nodiscard]] virtual bool isRunning() const = 0;
};
} // namespace tensorrt_llm::executor::kv_cache

View File

@ -47,6 +47,12 @@ class BaseKVCacheManager;
namespace tensorrt_llm::executor
{
using SizeType32 = tensorrt_llm::runtime::SizeType32;
// MmKey is used in KVCacheBlock when multimodal data is present in a block.
// Type alias for hash array + start offset at per-block granularity.
// This differs from the per-request level multimodal hash in MultimodalInput.
using MmKey = std::pair<std::array<uint8_t, 32>, SizeType32>;
/// @brief Version of TRT-LLM
char const* version() noexcept;
@ -1691,12 +1697,14 @@ struct KVCacheStoredBlockData
{
KVCacheStoredBlockData(IdType blockHash, tensorrt_llm::runtime::VecUniqueTokens tokens,
std::optional<tensorrt_llm::runtime::LoraTaskIdType> loraId, SizeType32 cacheLevel, SizeType32 priority)
std::optional<tensorrt_llm::runtime::LoraTaskIdType> loraId, SizeType32 cacheLevel, SizeType32 priority,
std::vector<MmKey> mmKeys = {})
: blockHash{blockHash}
, tokens{std::move(tokens)}
, loraId{loraId}
, cacheLevel{cacheLevel}
, priority{priority}
, mmKeys{std::move(mmKeys)}
{
}
@ -1710,6 +1718,8 @@ struct KVCacheStoredBlockData
SizeType32 cacheLevel;
/// @brief The priority of the block
SizeType32 priority;
/// @brief The multimodal keys of the block
std::vector<MmKey> mmKeys;
};
struct KVCacheStoredData
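The new trailing `mmKeys` member above carries the per-block multimodal keys; it defaults to an empty vector, so existing call sites keep compiling. A small hypothetical sketch of building one such key (hash bytes and offset are placeholders):

```cpp
#include <array>
#include <cstdint>
#include <vector>

#include "tensorrt_llm/executor/executor.h" // declares MmKey as shown above

namespace texec = tensorrt_llm::executor;

// One per-block multimodal key: a 32-byte content hash plus the token offset
// at which the multimodal item starts within the block.
std::vector<texec::MmKey> makeMmKeys()
{
    std::array<std::uint8_t, 32> hash{}; // placeholder hash bytes
    texec::SizeType32 startOffset = 0;   // placeholder offset
    return {texec::MmKey{hash, startOffset}};
}
```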

View File

@ -16,7 +16,11 @@
#pragma once
namespace tensorrt_llm::kernels
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
namespace detail
@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible<Arch>::value;
} // namespace arch
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,11 +17,14 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/executor/types.h"
#include <cstdint>
#include <curand_kernel.h>
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
class FinishedState
@ -308,4 +311,6 @@ template <typename T>
void invokeScatterDecodingParams(
T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream);
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,11 +17,14 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cstdint>
#include <cuda_runtime.h>
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
class KVCacheIndex
@ -53,4 +56,6 @@ private:
UnderlyingType value;
};
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -14,16 +14,18 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/runtime/iBuffer.h"
using namespace tensorrt_llm::runtime;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -1,18 +1,18 @@
# ##################################################################################################
# Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Redistribution and use in source and binary forms, with or without modification, are not permit-
# ted.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFIT;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ##################################################################################################
# #################################################################################################

View File

@ -200,38 +200,22 @@ ns_close = r"""
copyright = '''\
/***************************************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Redistribution and use in source and binary forms, with or without modification, are not permit-
* ted.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**************************************************************************************************/
''' if not generate_cu_trtllm else r"""/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
'''
makefile_template = '''\
@ -2175,7 +2159,8 @@ def get_kernel_code(kspec, kname, lname):
params_str = 'reinterpret_cast<bert::Fused_multihead_attention_params_v2 &>(params)' if generate_cu_trtllm else 'params'
attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;'
bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;'
include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else ''
include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else ''
include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else ''
num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;'
fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}'
const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}'
@ -2201,8 +2186,19 @@ def get_kernel_code(kspec, kname, lname):
const int COMPUTE_REG_COUNT = {compute_reg_count};
asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format(
compute_reg_count=compute_reg_count)
local_ns_open = ns_open if generate_cu_trtllm else ''
local_ns_close = ns_close if generate_cu_trtllm else ''
abi_ns_open = r"""
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
// clang-format off
"""
abi_ns_close = r"""
// clang-format on
} // namespace kernels
TRTLLM_NAMESPACE_END
"""
local_ns_open = abi_ns_open if generate_cu_trtllm else ''
local_ns_close = abi_ns_close if generate_cu_trtllm else ''
tmp = dict(locals(), **kspec._asdict())
@ -3077,8 +3073,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None):
def get_cubin_header(kernel_traits, specs_names):
cubins = []
cubin_lens = []
launchers = []
cubins_dict = {}
cubin_lens_dict = {}
launchers_dict = {}
for kspec, fname, lname, kname in specs_names:
if generate_cu_trtllm and not use_cubin_header(
kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype):
@ -3282,11 +3280,11 @@ def get_cubin_header(kernel_traits, specs_names):
if generate_cu_trtllm and lname != 'nullptr':
launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);'.format(
lname=lname)
if int(sm) in cubins_dict:
if launcher not in cubins_dict[int(sm)]:
cubins_dict[int(sm)].append(launcher)
if int(sm) in launchers_dict:
if launcher not in launchers_dict[int(sm)]:
launchers_dict[int(sm)].append(launcher)
else:
cubins_dict[int(sm)] = [launcher]
launchers_dict[int(sm)] = [launcher]
elif 'mhca' in kname:
code = '''\
{{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\
@ -3309,17 +3307,33 @@ def get_cubin_header(kernel_traits, specs_names):
else:
metadata_v2 = ',\n'.join(metadata_v2)
# Add macros to only include needed cubins during compilation.
for sm in cubins_dict.keys():
# Collect all SM versions from all dictionaries
all_sms = sorted(
set(
list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) +
list(launchers_dict.keys())))
for sm in all_sms:
macro_begin = f"#ifndef EXCLUDE_SM_{sm}"
macro_end = f"#endif\n"
cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
# Add cubin array declarations
if sm in cubins_dict:
cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
# Add cubin length declarations
if sm in cubin_lens_dict:
cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end])
# Add launcher declarations
if sm in launchers_dict:
launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end])
unroll_config_v1 = ',\n'.join(unroll_config_v1)
unroll_config_v2 = ',\n'.join(unroll_config_v2)
cubins = '\n'.join(cubins)
cubin_lens = '\n'.join(cubin_lens)
launchers = '\n'.join(launchers)
local_ns_open = ns_open
local_ns_close = ns_close if generate_cu_trtllm else '}'
launcher_line = '''
@ -3431,7 +3445,157 @@ static const struct TestMetaV2
'''.format(**locals(), copyright=copyright)
return code
# Generate header content (.h file)
if "GENERATE_CUBIN" in os.environ:
header_content = '''\
{copyright}
#pragma once
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels{{
struct FusedMultiHeadAttentionKernelMetaInfoV2
{{
Data_type mDataTypeIn;
Data_type mDataTypeOut;
unsigned int mS;
unsigned int mStepQ;
unsigned int mStepKV;
unsigned int mD;
unsigned int mDV;
unsigned int mSageBlockSizeQ;
unsigned int mSageBlockSizeK;
unsigned int mSageBlockSizeV;
unsigned int mSM;
const unsigned char* mCubin;
unsigned int mCubinSize;
const char* mFuncName;
unsigned int mSharedMemBytes;
unsigned int mThreadsPerCTA;
unsigned int mUnrollStep;
int mAttentionMaskType;
int mAttentionInputLayout;
bool mInterleaved;
bool mFlashAttention;
bool mWarpSpecialization;
bool mFP32Accumulation;
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
}};
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[];
extern const int sMhaKernelMetaInfosV2Size;
}} // namespace kernels
TRTLLM_NAMESPACE_END
'''.format(**locals(), copyright=copyright)
# Generate source content (.cpp file)
source_content = '''\
{copyright}
#include "tensorrt_llm/common/config.h"
#include <cstddef>
#include <cstdint>
#include <cuda_runtime_api.h>
{local_ns_open}
//--- Cubin Arrays
{cubins}
//--- Cubin Lengths
{cubin_lens}
{local_ns_close}
using namespace tensorrt_llm::kernels;
namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{
class Fused_multihead_attention_params_v2;
class Launch_params;
//--- Kernel Launchers
{launchers}
// FIXME: These are duplicated declarations, we should remove them in the future.
constexpr int32_t kSM_70 = 70;
constexpr int32_t kSM_72 = 72;
constexpr int32_t kSM_75 = 75;
constexpr int32_t kSM_80 = 80;
constexpr int32_t kSM_86 = 86;
constexpr int32_t kSM_89 = 89;
constexpr int32_t kSM_90 = 90;
constexpr int32_t kSM_100 = 100;
constexpr int32_t kSM_100f = 10100;
constexpr int32_t kSM_103 = 103;
constexpr int32_t kSM_120 = 120;
constexpr int32_t kSM_121 = 121;
// FIXME: These are duplicated declarations, we should remove them in the future.
enum Data_type
{{
DATA_TYPE_BOOL,
DATA_TYPE_FP16,
DATA_TYPE_FP32,
DATA_TYPE_INT4,
DATA_TYPE_INT8,
DATA_TYPE_INT32,
DATA_TYPE_BF16,
DATA_TYPE_E2M1,
DATA_TYPE_E4M3,
DATA_TYPE_E5M2
}};
struct FusedMultiHeadAttentionKernelMetaInfoV2
{{
Data_type mDataTypeIn;
Data_type mDataTypeOut;
unsigned int mS;
unsigned int mStepQ;
unsigned int mStepKV;
unsigned int mD;
unsigned int mDV;
unsigned int mSageBlockSizeQ;
unsigned int mSageBlockSizeK;
unsigned int mSageBlockSizeV;
unsigned int mSM;
const unsigned char* mCubin;
unsigned int mCubinSize;
const char* mFuncName;
unsigned int mSharedMemBytes;
unsigned int mThreadsPerCTA;
unsigned int mUnrollStep;
int mAttentionMaskType;
int mAttentionInputLayout;
bool mInterleaved;
bool mFlashAttention;
bool mWarpSpecialization;
bool mFP32Accumulation;
bool mAlibiSupported;
bool mTiled;
bool mEnableAttnLogitSoftcapping;
bool mReturnSoftmaxStats;{launcher_line}
}};
extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{
{metadata_v2}
}};
extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]);
}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
'''.format(**locals(), copyright=copyright)
else:
# Non-GENERATE_CUBIN mode: use old behavior
header_content = code
source_content = None
return header_content, source_content
# This is used to add some kernels running in cubins for passing CI cases.
@ -3449,9 +3613,20 @@ def modify_cubin_header(cubin_header):
return result
target = "#ifndef EXCLUDE_SM_80"
addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;"""
result = add_kernel_line(result, target, addition)
addition_cubin_array = """
#ifndef EXCLUDE_SM_80
extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
#endif
"""
addition_cubin_length = """
#ifndef EXCLUDE_SM_80
extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;
#endif
"""
# Add cubin array and length into their corresponding sections.
result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array)
result = add_kernel_line(result, "//--- Cubin Lengths",
addition_cubin_length)
def modify_kernel_line(result, target, new_line):
lines = result.split('\n')
@ -3534,13 +3709,22 @@ def generate_files(specs_names):
output = output.decode('utf-8').strip()
# this gives: kname, smem bytes, threads_per_cta, loop_step
kernel_traits = [traits.split() for traits in output.splitlines()]
cubin_header = get_cubin_header(kernel_traits, valid_specs_names)
# Use new function to generate both fmha_cubin.h and fmha_cubin.cpp files
# To switch back to old behavior, replace get_cubin_header_and_source with get_cubin_header
cubin_header, cubin_source = get_cubin_header(kernel_traits,
valid_specs_names)
if generate_cu_trtllm:
cubin_header = modify_cubin_header(cubin_header)
cubin_source = modify_cubin_header(cubin_source)
# Write fmha_cubin.h file
with open('./generated/fmha_cubin.h', 'w') as f:
f.write(cubin_header)
# Write fmha_cubin.cpp file (same directory as fmha_cubin.h file)
if cubin_source is not None:
with open('./generated/fmha_cubin.cpp', 'w') as f:
f.write(cubin_source)
def enumerate_hgmma_tma_kernels(specs, sm=90):
specs.append(

View File

@ -1,18 +1,18 @@
# ##################################################################################################
# Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Redistribution and use in source and binary forms, with or without modification, are not permit-
# ted.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFIT;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ##################################################################################################
# #################################################################################################

View File

@ -32,20 +32,20 @@ dtype2traits = {
fmha_dgrad_v2_flash_attention_template = '''\
/***************************************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Redistribution and use in source and binary forms, with or without modification, are not permit-
* ted.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**************************************************************************************************/
#include "fused_multihead_attention_fprop.h"
@ -157,20 +157,20 @@ void run_fmha_dgrad_v2_flash_attention_{dtype}_S_{head_size}_sm{sm}(
fmha_fprop_v2_flash_attention_template = '''\
/***************************************************************************************************
* Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Redistribution and use in source and binary forms, with or without modification, are not permit-
* ted.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**************************************************************************************************/
#include "fused_multihead_attention_fprop.h"

View File

@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/*
*/
#pragma once
namespace tensorrt_llm {
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels {
'''
@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}";
'''
TEMPLATE_EPILOGUE = '''}
}
TRTLLM_NAMESPACE_END
'''
D = defaultdict(list)

View File

@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/*
* See the License for the specific language governing permissions and
* limitations under the License.
*/
namespace tensorrt_llm
{
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
// clang-format off
@ -96,7 +98,7 @@ namespace kernels
cpp_file_suffex_text = R"""
// clang-format on
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
"""
cubin_meta_info_struct_prefix_text = R"""

View File

@ -466,20 +466,53 @@ using WarpAcc = WarpAccT<warpTile.y, warpTile.x>;
#define MMAS_N_PER_MASK 2
__device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskType const* mask, uint32_t rowOffset,
uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize)
uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
,
int32_t tok0WinBeg, uint32_t seqIter, uint32_t const cacheSeqLen, uint32_t const warpTileTokenBeg
#endif
)
{
uint32_t const idxInQuad = laneId() % 4;
uint32_t const idxQuad = laneId() / 4;
// Packed mask is aligned with 32 bits (2 uint16_t).
uint32_t const nbPackedMasksPerRow = divUp(qSeqLen, 32u) * 2u;
uint16_t const* uint16Mask = reinterpret_cast<uint16_t const*>(mask);
constexpr uint64_t fullMask = ~uint64_t{0};
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
Range const tileRange = {warpTileTokenBeg, warpTileTokenBeg + warpTile.x};
Range const maxMaskOutRange = {0, mha::max(0, tok0WinBeg) + (nbValidRows / MMAS_N_PER_MASK - 1)};
bool const ctaNeedBegMask = tileRange.beg < maxMaskOutRange.end;
assert(ctaNeedBegMask == overlap(tileRange, maxMaskOutRange));
int32_t const tok0NbMaskOut = int32_t(tok0WinBeg) - int32_t(warpTileTokenBeg);
uint32_t const nbSeqItersWithoutSpecDecMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
bool const ctaNeedSpecDecMask = (seqIter >= nbSeqItersWithoutSpecDecMask);
#else
constexpr bool ctaNeedBegMask = false;
bool const ctaNeedSpecDecMask = true;
int32_t const tok0NbMaskOut = -2147483648;
#endif
bool const needMask = ctaNeedBegMask || ctaNeedSpecDecMask;
if (!needMask)
{
return;
}
#pragma unroll
for (uint32_t m = 0; m < acc.rows; m++)
{
#pragma unroll
for (uint32_t i = 0; i < InstAcc::rows; i++)
{
uint32_t const tokenRow = min((rowOffset + instM * m + idxQuad + i * 8) / headGrpSize, actualQSeqLen - 1);
uint32_t const idxQTokInCta = (rowOffset + instM * m + idxQuad + i * 8) / headGrpSize;
uint32_t const tokenRow = min(idxQTokInCta, actualQSeqLen - 1);
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
int32_t const begNbMaskOut = tok0NbMaskOut + int32_t(idxQTokInCta);
uint64_t const begMask = (begNbMaskOut > 0 ? fullMask << begNbMaskOut : fullMask);
#else
uint64_t const begMask = fullMask;
#endif
#pragma unroll
for (uint32_t mask_n = 0; mask_n < acc.cols / MMAS_N_PER_MASK; mask_n++)
{
@ -491,12 +524,15 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy
uint32_t const maskPos1 = lastCol + actualQSeqLen < nbValidCols
? 0u
: min(lastCol + actualQSeqLen - nbValidCols, actualQSeqLen - 1);
uint32_t packedMask = 0u;
uint32_t const maskPosStart = (maskPos0 / 16) * 16;
reinterpret_cast<uint16_t*>(&packedMask)[0]
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)];
reinterpret_cast<uint16_t*>(&packedMask)[1]
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)];
uint32_t packedMask = ~uint32_t{0};
if (ctaNeedSpecDecMask)
{
reinterpret_cast<uint16_t*>(&packedMask)[0]
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)];
reinterpret_cast<uint16_t*>(&packedMask)[1]
= uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)];
}
#pragma unroll
for (uint32_t nj = 0; nj < MMAS_N_PER_MASK; nj++)
{
@ -510,7 +546,11 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy
bool const maskFlag = col + actualQSeqLen < nbValidCols
? true
: packedMask & (1u << ((col + actualQSeqLen - nbValidCols) - maskPosStart));
acc(m, n)(i, j) = maskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax;
bool const begMaskFlag = ctaNeedBegMask ? (begMask & (1ULL << col)) : true;
acc(m, n)(i, j)
= maskFlag && begMaskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax;
}
}
}
@ -1611,8 +1651,14 @@ CUBIN_EXPORT __global__
#endif
uint32_t const cacheSeqLen = getCacheSeqLen<usePagedKVCache>(cacheList, idxReq);
#if SLIDING_WINDOW
#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE
uint32_t const tok0SeqLen = cacheSeqLen - actualQSeqLen + 1 + idxHeadTokenInGrp; // ctaTokOffset;
int32_t const tok0WinBeg = int32_t(tok0SeqLen) - int32_t(slidingWinSize);
uint32_t const nbTotalSkipTokens = mha::max(0, tok0WinBeg);
#elif SLIDING_WINDOW
bool const rtIsReallySliding = (cacheSeqLen > slidingWinSize);
assert(!SPEC_DEC || !rtIsReallySliding);
uint32_t const nbTotalSkipTokens = rtIsReallySliding ? cacheSeqLen - slidingWinSize : 0;
#else
constexpr bool rtIsReallySliding = false;
@ -1626,7 +1672,9 @@ CUBIN_EXPORT __global__
#endif
uint32_t const nbSeqIters = useKVCache ? divUp(cacheSeqLen, ctaTile.x) : 0;
#if SPEC_DEC
#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE
uint32_t const nbSeqItersWithoutMask = nbSkipLeadingTiles;
#elif SPEC_DEC
uint32_t const nbSeqItersWithoutMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
#endif
@ -1912,8 +1960,12 @@ CUBIN_EXPORT __global__
if (seqIter >= nbSeqItersWithoutMask)
{
uint32_t const nbValidCols = (warpTileTokenBeg < cacheSeqLen ? cacheSeqLen - warpTileTokenBeg : 0U);
applyMaskFromInput(
warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize);
applyMaskFromInput(warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize
#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
,
tok0WinBeg, seqIter, cacheSeqLen, warpTileTokenBeg
#endif
);
}
#else
bool const isFirstIter = (seqIter == nbSkipLeadingTiles);
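
The new SLIDING_WINDOW && !IS_SPEC_DEC_TREE path in this kernel adds a per-row "begin" mask that drops columns falling before the sliding window, on top of the existing packed spec-dec mask; a column then survives only if begMask & (1ULL << col) is set. A simplified host-side illustration of the begin-mask computation (names mirror the kernel; this is a sketch that assumes a warp-tile row spans at most 64 columns, and it clamps the shift where the kernel relies on that bound):

#include <cstdint>

// Columns [0, begNbMaskOut) of a warp-tile row lie before the sliding window for
// this query token and must be masked off; the remaining columns stay valid.
inline uint64_t computeBegMask(int32_t tok0WinBeg, uint32_t warpTileTokenBeg, uint32_t idxQTokInCta)
{
    constexpr uint64_t fullMask = ~uint64_t{0};
    int32_t const tok0NbMaskOut = tok0WinBeg - int32_t(warpTileTokenBeg);
    int32_t const begNbMaskOut = tok0NbMaskOut + int32_t(idxQTokInCta);
    if (begNbMaskOut <= 0)
    {
        return fullMask; // the whole row is inside the window
    }
    return begNbMaskOut >= 64 ? uint64_t{0} : (fullMask << begNbMaskOut);
}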

View File

@ -247,7 +247,8 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
{
break;
}
else if (req->isGenerationInProgressState())
if (req->isGenerationInProgressState())
{
scheduledRequests.emplace_back(req);
reservedBlocks.decrementReservedBlocks(*req);
@ -296,7 +297,8 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
{
break;
}
else if (req->isContextInitState() || req->isDisaggGenerationInitState())
if (req->isContextInitState() || req->isDisaggGenerationInitState())
{
bool enoughBlocks = reservedBlocks.enoughAvailableBlocks(*req);
bool enoughCrossBlocks

View File

@ -360,6 +360,12 @@ public:
RequestInfo info;
auto const* connection = isAgent ? agentConnectionManager->recvConnectionAndRequestInfo(info)
: mManager->recvConnect(DataContext{TransceiverTag::kID_TAG}, &id, sizeof(id));
if (connection == nullptr && !mManager->isRunning())
{
TLLM_LOG_WARNING("recvRequestInfo: connection is nullptr; the server may be terminating");
return info;
}
if (!isAgent)
{
TLLM_CHECK(id == TransceiverTag::Id::REQUEST_SEND);
@ -616,6 +622,10 @@ private:
if (!mReadyResponses.empty())
{
auto const& requestInfo = recvRequestInfo();
if (mTerminate || !mManager->isRunning())
{
return;
}
auto reqId = requestInfo.getRequestId();
{

View File

@ -102,7 +102,7 @@ void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks
for (auto const& block : blocks)
{
data.blocks.emplace_back(block->getHash(), block->getUniqueTokens(), block->getBlockKey().loraTaskId,
block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority());
block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority(), block->getExtraKeys());
}
enqueueEvent({mEventId++, data, windowSize, mAttentionDpRank});

View File

@ -284,6 +284,11 @@ tk::KVCacheIndex::UnderlyingType KVCacheBlock::getMemoryPoolBlockIndex() const
return mMemoryPoolBlockIndex.get();
}
std::vector<MmKey> KVCacheBlock::getExtraKeys() const
{
return mBlockKey.extraKeys;
}
bool KVCacheBlock::isPrimary() const
{
return mMemoryPoolBlockIndex.isPrimary();

View File

@ -27,7 +27,7 @@ bool initCheckDebug()
}
} // namespace
bool DebugConfig::isCheckDebugEnabled()
bool tensorrt_llm::DebugConfig::isCheckDebugEnabled()
{
static bool const debugEnabled = initCheckDebug();
return debugEnabled;

View File

@ -16,6 +16,7 @@
*/
#include "attentionOp.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"

View File

@ -16,6 +16,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/opUtils.h"
#include "tensorrt_llm/common/quantization.h"
@ -36,7 +37,9 @@
#include <nccl.h>
#endif // ENABLE_MULTI_DEVICE
namespace tensorrt_llm::common::op
TRTLLM_NAMESPACE_BEGIN
namespace common::op
{
class AttentionOp
@ -543,4 +546,6 @@ private:
UniqPtrWNullCopy<int32_t[], Deleter> mMultiBlockSemaphores = {};
};
} // namespace tensorrt_llm::common::op
} // namespace common::op
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasVersionCheck.h"
#include <algorithm>
#include <unordered_map>
@ -24,8 +25,8 @@
#error CUDART_VERSION Undefined!
#endif
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
@ -24,8 +25,8 @@
#include <optional>
#include <string>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -185,4 +186,4 @@ public:
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,13 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif // ENABLE_BF16
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END
// Operator definitions intentionally in global namespace
namespace

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <assert.h>
#include <cstdlib>
@ -28,8 +29,8 @@
#include <string>
#include <type_traits>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
static __host__ __device__ int hash(int val)
@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer<DEPTH, CTAS_PER_CGA>
};
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -18,6 +18,7 @@
#if defined(_WIN32)
#include <windows.h>
#define dllOpen(name) LoadLibrary("nv" name ".dll")
#define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
#define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
@ -29,6 +30,7 @@
#endif // defined(_WIN32)
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaDriverWrapper.h"
#include "tensorrt_llm/common/logger.h"
#include <cuda.h>
@ -36,7 +38,9 @@
#include <cstdio>
#include <mutex>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters(
return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#ifndef CUDA_DRIVER_WRAPPER_H
#define CUDA_DRIVER_WRAPPER_H
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/tllmException.h"
@ -25,7 +26,9 @@
#include <cstdio>
#include <memory>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
class CUDADriverWrapper
@ -165,8 +168,9 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
}
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
/*
* Macros compliant with TensorRT coding conventions
*/

View File

@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/envUtils.h"
@ -24,8 +25,8 @@
#include <limits>
#include <type_traits>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
#ifdef ENABLE_FP8
@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3);
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/cudaProfilerUtils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <cstdint>
@ -54,7 +55,9 @@ std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIte
} // namespace
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
@ -81,4 +84,6 @@ std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIter
return std::make_pair(profileIterIdxs, stopIterIdxs);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -25,9 +25,10 @@
#if ENABLE_BF16
#include <cuda_bf16.h>
#endif
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace common
{
@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
@ -25,7 +26,9 @@
using tensorrt_llm::kernels::AllReduceFusionOp;
using tensorrt_llm::kernels::AllReduceStrategyType;
namespace tensorrt_llm::utils::customAllReduceUtils
TRTLLM_NAMESPACE_BEGIN
namespace utils::customAllReduceUtils
{
constexpr size_t NUM_POINTERS_PER_RANK = 7;
@ -292,4 +295,6 @@ inline const std::unordered_map<int, AllReduceBestStrategyTableType> AllReduceBe
{90, AllReduceBestStrategyTableSM90},
{100, AllReduceBestStrategyTableSM100},
};
} // namespace tensorrt_llm::utils::customAllReduceUtils
} // namespace utils::customAllReduceUtils
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
*/
#include "envUtils.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/stringUtils.h"
@ -25,7 +26,9 @@
#include <optional>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::optional<int32_t> getIntEnv(char const* name)
@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy()
return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,16 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <optional>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// Useful when you want to inject some debug code controllable with env var.
std::optional<int32_t> getIntEnv(char const* name);
@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow();
bool getEnvEplbForceGdrcopy();
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -19,6 +19,7 @@
#ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH
#define TRTLLM_CUDA_LAMPORT_UTILS_CUH
#include "tensorrt_llm/common/config.h"
#include <array>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
@ -29,7 +30,9 @@
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
constexpr uint16_t kNEGZERO_FP16 = 0x8000U;
@ -279,6 +282,7 @@ private:
}
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END
#endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH

View File

@ -15,12 +15,15 @@
*/
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/tllmException.h"
#include <cuda_runtime.h>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
Logger::Logger()
@ -70,4 +73,6 @@ Logger* Logger::getLogger()
thread_local Logger instance;
return &instance;
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -16,10 +16,11 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cuda_runtime.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n)
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,11 +14,15 @@
* limitations under the License.
*/
#include "mcastDevMemUtils.h"
#include "tensorrt_llm/common/config.h"
#include <unordered_map>
namespace tensorrt_llm::common
using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory;
TRTLLM_NAMESPACE_BEGIN
namespace common
{
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
namespace
{
@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr)
{
return McastDevMemBufferRegistry::getInstance().findBuffer(ptr);
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,13 +15,17 @@
*/
#pragma once
// Avoid circular dependency
#include "tensorrt_llm/common/config.h"
namespace tensorrt_llm::runtime
{
class McastDeviceMemory;
}
} // namespace tensorrt_llm::runtime
namespace tensorrt_llm::common
// Avoid circular dependency
TRTLLM_NAMESPACE_BEGIN
namespace common
{
using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;
// Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer!
@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf);
// information. Thus a derived pointer cannot be used as the key.
McastDeviceMemory* findMcastDevMemBuffer(void* ptr);
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/memoryUtils.h"
@ -25,8 +26,8 @@
#include <sanitizer/asan_interface.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -961,4 +962,5 @@ void calcAlignedPointers(
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,14 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include <cassert>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers(
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
@ -46,7 +47,9 @@
#include <dlfcn.h>
#endif
namespace tensorrt_llm::common::nccl_util
TRTLLM_NAMESPACE_BEGIN
namespace common::nccl_util
{
//==============================================================================
@ -392,6 +395,8 @@ inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
return std::make_pair(tensor, buffer);
}
} // namespace tensorrt_llm::common::nccl_util
} // namespace common::nccl_util
TRTLLM_NAMESPACE_END
#endif // ENABLE_MULTI_DEVICE

View File

@ -25,10 +25,13 @@
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#include "tensorrt_llm/common/config.h"
#include <array>
namespace tensorrt_llm::common::nvtx
TRTLLM_NAMESPACE_BEGIN
namespace common::nvtx
{
inline nvtx3::color nextColor()
{
@ -46,8 +49,9 @@ inline nvtx3::color nextColor()
#endif
}
} // namespace tensorrt_llm::common::nvtx
} // namespace common::nvtx
TRTLLM_NAMESPACE_END
#define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \
::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
#define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)

View File

@ -29,6 +29,7 @@
#include <mutex>
#include <thread>
TRTLLM_NAMESPACE_BEGIN
#if ENABLE_MULTI_DEVICE
std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
@ -378,3 +379,5 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
});
return creator();
}
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cublasMMWrapper.h"
#include "tensorrt_llm/common/workspace.h"
@ -37,7 +38,9 @@
#include <string>
#include <unordered_map>
namespace tensorrt_llm::common::op
TRTLLM_NAMESPACE_BEGIN
namespace common::op
{
// Write values into buffer
@ -178,7 +181,7 @@ struct hash
// for testing only
void const* getCommSessionHandle();
} // namespace tensorrt_llm::common::op
} // namespace common::op
inline bool isBuilding()
{
@ -220,6 +223,8 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
std::shared_ptr<cublasHandle_t> getCublasHandle();
std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();
TRTLLM_NAMESPACE_END
#ifndef DEBUG
#define PLUGIN_CHECK(status) \

View File

@ -16,14 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include <cuda.h>
#include <cuda_fp16.h>
#include <float.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3>
#endif // ENABLE_FP8
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -21,6 +21,7 @@
#else
#include <cooperative_groups.h>
#endif
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
@ -30,8 +31,8 @@
namespace cg = cooperative_groups;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace common
{
@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input)
}
} // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#include "safetensors.h"
#include "nlohmann/json.hpp"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <NvInferRuntime.h>
#include <cstdint>
#include <fstream>
@ -25,7 +26,9 @@
#include <utility>
#include <vector>
namespace tensorrt_llm::common::safetensors
TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{
using nvinfer1::DataType;
@ -164,4 +167,6 @@ std::shared_ptr<ISafeTensor> ISafeTensor::open(char const* filename)
{
return std::make_shared<SafeTensor>(filename);
}
} // namespace tensorrt_llm::common::safetensors
} // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include <NvInferRuntime.h>
#include <cstdint>
@ -23,7 +24,9 @@
#include <memory>
#include <utility>
namespace tensorrt_llm::common::safetensors
TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{
class INdArray
{
@ -58,4 +61,6 @@ public:
virtual ~ISafeTensor() = default;
};
} // namespace tensorrt_llm::common::safetensors
} // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <functional>
#include <numeric>
#include <optional>
#include <sstream>
namespace tensorrt_llm::common::stl_utils
TRTLLM_NAMESPACE_BEGIN
namespace common::stl_utils
{
template <typename TInputIt, typename TOutputIt, typename TBinOp>
@ -120,4 +123,6 @@ std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOpe
return oss.str();
}
} // namespace tensorrt_llm::common::stl_utils
} // namespace common::stl_utils
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cerrno>
#include <cstdarg>
@ -23,7 +24,9 @@
#include <iostream>
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
@ -73,4 +76,6 @@ std::unordered_set<std::string> str2set(std::string const& input, char delimiter
return values;
};
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,13 +14,16 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include <chrono>
#include <iomanip>
#include <sstream>
#include "tensorrt_llm/common/timestampUtils.h"
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::string getCurrentTimestamp()
@ -39,4 +42,6 @@ std::string getCurrentTimestamp()
return stream.str();
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,12 +14,17 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include <string>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
std::string getCurrentTimestamp();
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "tensorrt_llm/common/tllmException.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h"
#include <cinttypes>
@ -26,7 +27,9 @@
#endif
#include <sstream>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
namespace
@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
return mErrorCode;
}
} // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,10 +14,13 @@
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cstddef>
#include <cstdint>
namespace tensorrt_llm::common
TRTLLM_NAMESPACE_BEGIN
namespace common
{
// CuBLAS >= 12.9.1 requires 256-byte alignment.
@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize(
return total;
}
}; // namespace tensorrt_llm::common
} // namespace common
TRTLLM_NAMESPACE_END

View File

@ -18,10 +18,11 @@
#include <cuda_runtime_api.h>
#include "cutlass/device_kernel.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel()
}
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -30,10 +30,11 @@
#include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "cutlass_extensions/epilogue/thread/fused_activations.h"
#include "tensorrt_llm/common/config.h"
#include <cutlass/epilogue/fusion/operations.hpp>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -150,4 +151,5 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
};
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -24,10 +24,11 @@
#include "cute/tensor.hpp"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h"
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace cutlass_extensions
{
@ -535,4 +536,5 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
}
} // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -319,6 +319,10 @@ AgentConnection const* AgentConnectionManager::recvConnectionAndRequestInfo(batc
{
while (true)
{
if (!mIsRunning)
{
return nullptr;
}
updateUnhandledNotifications();
std::scoped_lock lock(mNotificationMutex);
auto it = mUnhandledNotifications.begin();
@ -491,6 +495,11 @@ void AgentConnectionManager::waitForNotification(std::string const& remoteAgentN
while (true)
{
if (!mIsRunning)
{
return;
}
updateUnhandledNotifications();
std::scoped_lock lock(mNotificationMutex);
auto it = mUnhandledNotifications.begin();
@ -587,6 +596,13 @@ std::string const& AgentConnectionManager::getAgentName() const
AgentConnectionManager::~AgentConnectionManager()
{
mIsRunning = false;
m_Agent->deregisterMemory(mRegMemDescs);
}
bool AgentConnectionManager::isRunning() const
{
return mIsRunning;
}
} // namespace tensorrt_llm::executor::kv_cache

View File

@ -296,6 +296,7 @@ public:
void waitForNotification(std::string const& remoteAgentName, NotificationType& expectedInfo);
void waitForSyncInfo(std::string const& remoteAgentName, NotificationSyncInfo& syncInfo);
void waitForReadySignal(std::string const& remoteAgentName, ReadySignalInfo& readySignalInfo);
[[nodiscard]] bool isRunning() const override;
private:
std::map<std::string, std::shared_ptr<AgentConnection>> mConnections;
@ -309,6 +310,7 @@ private:
int mDeviceId;
std::string mAgentName;
MemoryDescs mRegMemDescs;
std::atomic<bool> mIsRunning{true};
};
} // namespace tensorrt_llm::executor::kv_cache

View File

@ -77,4 +77,13 @@ CommState const& MpiConnectionManager::getCommState() const
return mCommState;
}
bool MpiConnectionManager::isRunning() const
{
return mIsRunning;
}
MpiConnectionManager::~MpiConnectionManager()
{
mIsRunning = false;
}
} // namespace tensorrt_llm::executor::kv_cache

View File

@ -42,14 +42,17 @@ class MpiConnectionManager : public ConnectionManager
{
public:
MpiConnectionManager(mpi::MpiComm const* comm);
~MpiConnectionManager();
MpiConnection const* recvConnect(DataContext const& ctx, void* data, size_t size) override;
[[nodiscard]] std::vector<Connection const*> getConnections(CommState const& state) override;
[[nodiscard]] CommState const& getCommState() const override;
[[nodiscard]] bool isRunning() const override;
private:
mpi::MpiComm const* mComm;
std::map<int, MpiConnection> mConnections;
CommState mCommState;
std::atomic<bool> mIsRunning{true};
};
} // namespace tensorrt_llm::executor::kv_cache
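
The agent, MPI, and UCX connection managers all gain the same shutdown plumbing: an atomic running flag that the destructor clears and that blocking receive/notification loops poll so they can return instead of hanging while the server terminates. A condensed sketch of the pattern (illustrative only; the real classes carry their full connection state):

#include <atomic>

class ConnectionManagerSketch
{
public:
    ~ConnectionManagerSketch()
    {
        mIsRunning = false; // let any loop blocked in its poll cycle observe shutdown
    }

    [[nodiscard]] bool isRunning() const
    {
        return mIsRunning;
    }

    void receiveLoop()
    {
        while (mIsRunning)
        {
            // ... poll for incoming requests or notifications, then handle them ...
        }
        // shutdown requested: return without waiting for messages that may never arrive
    }

private:
    std::atomic<bool> mIsRunning{true};
};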

View File

@ -454,21 +454,10 @@ void NixlTransferAgent::invalidateRemoteAgent(std::string const& name)
void NixlTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage)
{
if (name == mName)
{
// FIXME: nixl does not support gen notif to itself ,but support local transfer. we use local transfer to notify
// itself
MemoryDescs descs{MemoryType::kDRAM, {MemoryDesc{mDRamSrcBuffer}, MemoryDesc{mDRamDstBuffer}}};
TransferRequest request{TransferOp::kWRITE, descs, descs, name, syncMessage};
auto request_status = submitTransferRequests(request);
request_status->wait();
}
else
{
auto status = mRawAgent->genNotif(name, syncMessage);
TLLM_CHECK_WITH_INFO(
status == NIXL_SUCCESS, "genNotif failed with status: %s", nixlEnumStrings::statusStr(status).c_str());
}
auto status = mRawAgent->genNotif(name, syncMessage);
TLLM_CHECK_WITH_INFO(
status == NIXL_SUCCESS, "genNotif failed with status: %s", nixlEnumStrings::statusStr(status).c_str());
}
[[nodiscard]] std::unordered_map<std::string, std::vector<SyncMessage>> NixlTransferAgent::getNotifiedSyncMessages()

View File

@ -504,7 +504,7 @@ UcxConnectionManager::~UcxConnectionManager()
socket.close();
mZmqRepThread.join();
}
mIsRunning = false;
mZmqRepSocket.close();
mZmqContext.close();
@ -673,6 +673,11 @@ std::vector<Connection const*> UcxConnectionManager::getConnections(CommState co
return ret;
}
bool UcxConnectionManager::isRunning() const
{
return mIsRunning;
}
CommState const& UcxConnectionManager::getCommState() const
{
return mCommState;

View File

@ -62,6 +62,7 @@ private:
zmq::socket_t mZmqRepSocket;
std::string mZmqRepEndpoint;
std::thread mZmqRepThread;
std::atomic<bool> mIsRunning{true};
UcxConnection::ConnectionIdType getNewConnectionId(std::shared_ptr<ucxx::Endpoint> const& newEp);
UcxConnection::ConnectionIdType addConnection(std::string const& ip, uint16_t port);
@ -85,6 +86,8 @@ public:
{
return mRank;
}
[[nodiscard]] bool isRunning() const override;
};
#if defined(__clang__)

View File

@ -52,7 +52,8 @@ namespace tensorrt_llm::executor
namespace
{
[[nodiscard]] bool executorConfigIsValid(ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
[[nodiscard]] bool executorConfigIsValid(
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{
// Make sure logic in this function matches fixExecutorConfig
if (executorConfig.getEnableChunkedContext())
@ -65,8 +66,8 @@ namespace
return true;
}
[[nodiscard]] ExecutorConfig fixExecutorConfig(
ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
[[nodiscard]] ::tensorrt_llm::executor::ExecutorConfig fixExecutorConfig(
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{
// Make sure logic in this function matches executorConfigIsValid
auto fixedExecutorConfig = executorConfig;
@ -241,7 +242,7 @@ private:
void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt,
std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig,
ExecutorConfig const& executorConfig, bool isEncoder,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{
auto const gpusPerNode = jsonConfig.getGpusPerNode();
@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& model
Executor::Impl::Impl(std::filesystem::path const& modelPath,
std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json");
@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath,
Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr,
std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr,
ModelType const modelType, ExecutorConfig const& executorConfig,
ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{
auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr);
@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json
}
Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto const& worldConfig = model->getWorldConfig();
auto const tp = worldConfig.getTensorParallelism();
@ -388,7 +389,7 @@ Executor::Impl::~Impl()
shutdown();
}
void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto const gptModelType = [&executorConfig, &modelConfig]()
{
@ -512,7 +513,7 @@ std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& raw
std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig)
::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{
auto fixedExecutorConfig = ExecutorConfig{};
fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig());
@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm(
}
void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig,
std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig)
{
@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig
}
void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
std::filesystem::path const& modelPath)
{
#if ENABLE_MULTI_DEVICE

View File

@ -2333,6 +2333,7 @@ size_t Serialization::serializedSize(KVCacheStoredBlockData const& data)
totalSize += su::serializedSize(data.loraId);
totalSize += su::serializedSize(data.cacheLevel);
totalSize += su::serializedSize(data.priority);
totalSize += su::serializedSize(data.mmKeys);
return totalSize;
}
@ -2343,6 +2344,7 @@ void Serialization::serialize(KVCacheStoredBlockData const& data, std::ostream&
su::serialize(data.loraId, os);
su::serialize(data.cacheLevel, os);
su::serialize(data.priority, os);
su::serialize(data.mmKeys, os);
}
KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::istream& is)
@ -2352,8 +2354,9 @@ KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::ist
auto loraId = su::deserialize<std::optional<tensorrt_llm::runtime::LoraTaskIdType>>(is);
auto cacheLevel = su::deserialize<SizeType32>(is);
auto priority = su::deserialize<SizeType32>(is);
auto mmKeys = su::deserialize<std::vector<tensorrt_llm::batch_manager::kv_cache_manager::MmKey>>(is);
return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, priority};
return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, priority, mmKeys};
}
// KVcacheRemovedData

View File

@ -40,9 +40,7 @@ list(FILTER SRC_CU EXCLUDE REGEX "fusedLayernormKernels/.*")
function(filter_cuda_archs ARCH SOURCES_VAR)
if(NOT "${ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
set(FILTER_REGEX
".*_sm(_)?${ARCH}[.]cubin[.]cpp|^.*Sm(_)?${ARCH}.*cubin.cpp$|.*_sm(_)?${ARCH}[.]cu|^.*Sm(_)?${ARCH}.*cu$"
)
set(FILTER_REGEX ".*[Ss][Mm]_?${ARCH}(af)?.*(cubin\.cpp|\.cu)$")
list(APPEND SOURCES ${${SOURCES_VAR}})
list(APPEND SOURCES_FILTERED ${SOURCES})
list(FILTER SOURCES_FILTERED INCLUDE REGEX "${FILTER_REGEX}")
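
The consolidated filter regex collapses the previous pair of alternations into one pattern that also covers the "af" kernel variants and both sm/Sm spellings. For ARCH=90 it is intended to match names such as kernel_sm90.cubin.cpp, fooSm90.cu, bar_sm_90.cu, or kernelSm90af.cubin.cpp (these filenames are illustrative, not taken from the repository).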

View File

@ -16,9 +16,12 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache,
@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca
cudaStream_t stream = 0);
}
TRTLLM_NAMESPACE_END

View File

@ -17,12 +17,15 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con
int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
cudaStream_t const stream = 0);
} // namespace tensorrt_llm::kernels
} // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#include "attentionMask.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h"
@ -24,8 +25,8 @@
using namespace tensorrt_llm::common;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const&
////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/gptKernels.h"
#include "tensorrt_llm/runtime/iTensor.h"
@ -25,8 +26,8 @@
namespace tc = tensorrt_llm::common;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -64,4 +65,5 @@ template <typename MaskDataType>
void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,14 +14,15 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/banBadWords.h"
using namespace tensorrt_llm::common;
using namespace tensorrt_llm::runtime;
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt
SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,13 @@
#pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/runtime/common.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
namespace tensorrt_llm
{
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr,
cudaStream_t stream);
} // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

Some files were not shown because too many files have changed in this diff.