[None][fix] Introduce inline namespace to avoid symbol collision (#9541)

Signed-off-by: Yihan Wang <yihwang@nvidia.com>
Yihan Wang 2025-12-12 23:32:15 +08:00 committed by GitHub
parent af315d8ef1
commit 9df4dad3b6
621 changed files with 4168 additions and 9576 deletions
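The mechanism behind the fix, for readers skimming the diff: an `inline` namespace is transparent to name lookup but still participates in name mangling, so two builds of the library that use different ABI tags export distinct symbols and can coexist in one process. A minimal sketch (hypothetical names, not code from this commit):

#include <cstdio>

namespace tensorrt_llm
{
inline namespace _v1
{
// Mangles as tensorrt_llm::_v1::abiTag(). A second copy of the library built
// with a different tag (e.g. _v2) would export tensorrt_llm::_v2::abiTag(),
// so the two definitions never collide at link or load time.
inline char const* abiTag()
{
    return "_v1";
}
} // namespace _v1
} // namespace tensorrt_llm

int main()
{
    // Callers keep writing the un-versioned name; the inline namespace is
    // invisible to lookup but present in the mangled symbol.
    std::printf("%s\n", tensorrt_llm::abiTag());
    return 0;
}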

.gitattributes vendored
View File

@@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text
+cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text
+cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text

.gitignore vendored
View File

@@ -74,6 +74,7 @@ llm-test-workspace/
 cpp/include/tensorrt_llm/executor/version.h
 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
+cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
 .devcontainer/.env
 /examples/layer_wise_benchmarks/profiles/

View File

@@ -1,6 +1,7 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
+ *AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,13 +18,16 @@
  */

 #include "utils.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/logger.h"

 #include <random>
 #include <filesystem>
 #include <fstream>

-namespace tensorrt_llm::benchmark
+TRTLLM_NAMESPACE_BEGIN
+
+namespace benchmark
 {

 std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
@@ -98,7 +102,8 @@ Samples parseWorkloadJson(
     if (samples.size() < maxNumSamples)
     {
         TLLM_LOG_WARNING(
-            "Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
+            "Dataset size %zu is smaller than given max_num_samples "
+            "%d, max_num_samples will be ignored.\n",
             samples.size(), maxNumSamples);
     }
     return samples;
@@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric)
     return os;
 }

-} // namespace tensorrt_llm::benchmark
+} // namespace benchmark
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@
  * limitations under the License.
  */

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/executor/executor.h"

 #include <cstdint>
@@ -29,7 +30,9 @@

 #pragma once

-namespace tensorrt_llm::benchmark
+TRTLLM_NAMESPACE_BEGIN
+
+namespace benchmark
 {

 // using namespace tensorrt_llm::batch_manager;
@@ -237,4 +240,6 @@ std::vector<double> generateRandomExponentialValues(int count, float lambda, int

 std::vector<double> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);

-} // namespace tensorrt_llm::benchmark
+} // namespace benchmark
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,8 +16,9 @@

 #pragma once

-namespace tensorrt_llm
-{
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN

 // Base class for algorithms
 struct Algorithm
@@ -29,4 +30,4 @@ struct Algorithm
     Algorithm& operator=(Algorithm const&) = delete;
 };

-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END

View File

@@ -17,9 +17,13 @@

 #pragma once

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"

 #include <cstdint>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 //!
@@ -100,4 +104,6 @@ private:
     size_type mSize;
 };

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,14 +16,19 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/tllmException.h"

+TRTLLM_NAMESPACE_BEGIN
+
 class DebugConfig
 {
 public:
     static bool isCheckDebugEnabled();
 };

+TRTLLM_NAMESPACE_END
+
 #if defined(_WIN32)
 #define TLLM_LIKELY(x) (__assume((x) == 1), (x))
 #define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
@@ -35,8 +40,8 @@ public:
 #define TLLM_CHECK(val) \
     do \
     { \
-        TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
-                                            : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
+        TLLM_LIKELY(static_cast<bool>(val)) \
+            ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
     } while (0)

 #define TLLM_CHECK_WITH_INFO(val, info, ...) \
@@ -51,17 +56,17 @@ public:
 #define TLLM_CHECK_DEBUG(val) \
     do \
     { \
-        if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
+        if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
         { \
-            TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0) \
-                                                : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
+            TLLM_LIKELY(static_cast<bool>(val)) \
+                ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \
         } \
     } while (0)

 #define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \
     do \
     { \
-        if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \
+        if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \
         { \
             TLLM_LIKELY(static_cast<bool>(val)) \
                 ? ((void) 0) \
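A note on why the macros above switch from bare `DebugConfig` to the fully qualified `tensorrt_llm::DebugConfig`: a macro body is expanded at its use site, so unqualified names resolve in whatever scope the caller happens to be in. A hypothetical sketch, not part of the diff:

namespace app
{
struct DebugConfig // unrelated type that happens to share the name
{
    static bool isCheckDebugEnabled();
};

void f()
{
    // TLLM_CHECK_DEBUG(...) expands right here. With an unqualified
    // DebugConfig, the expansion would find app::DebugConfig; spelling
    // tensorrt_llm::DebugConfig in the macro body pins the lookup to the
    // intended class. The inline ABI namespace keeps that spelling valid:
    // it actually names tensorrt_llm::_v1::DebugConfig by default.
}
} // namespace app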

View File

@@ -17,9 +17,13 @@

 #pragma once

 #include "c10/util/intrusive_ptr.h"
+#include "tensorrt_llm/common/config.h"

 #include <Python.h>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 // Adapted from pybind11's example implementation:
@@ -69,4 +73,6 @@ c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a
     return c10::intrusive_ptr<T>::reclaim_copy(p);
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef TRTLLM_CONFIG_H
+#define TRTLLM_CONFIG_H
+
+/**
+ * \def TRTLLM_ABI_NAMESPACE
+ * This macro is used to open an implicitly inline namespace block for the ABI version.
+ * This macro can be overridden to change the ABI version.
+ * The default ABI version is _v1.
+ */
+#ifndef TRTLLM_ABI_NAMESPACE
+#define TRTLLM_ABI_NAMESPACE _v1
+#endif
+
+#ifndef TRTLLM_ABI_NAMESPACE_BEGIN
+#define TRTLLM_ABI_NAMESPACE_BEGIN \
+    inline namespace TRTLLM_ABI_NAMESPACE \
+    {
+#endif
+
+#ifndef TRTLLM_ABI_NAMESPACE_END
+#define TRTLLM_ABI_NAMESPACE_END }
+#endif
+
+/**
+ * \def TRTLLM_NAMESPACE_BEGIN
+ * This macro is used to open a `tensorrt_llm::` namespace block, along with any
+ * enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
+ * This macro is defined by TensorRT-LLM and may not be overridden.
+ */
+#define TRTLLM_NAMESPACE_BEGIN \
+    namespace tensorrt_llm \
+    { \
+    TRTLLM_ABI_NAMESPACE_BEGIN
+
+/**
+ * \def TRTLLM_NAMESPACE_END
+ * This macro is used to close a `tensorrt_llm::` namespace block, along with any
+ * enclosing namespaces requested by TRTLLM_WRAPPED_NAMESPACE, etc.
+ * This macro is defined by TensorRT-LLM and may not be overridden.
+ */
+#define TRTLLM_NAMESPACE_END \
+    TRTLLM_ABI_NAMESPACE_END \
+    } /* end namespace tensorrt_llm */
+
+#endif // TRTLLM_CONFIG_H
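To make the macro pair concrete, here is what a header wrapped with it expands to; `Widget` is a placeholder name for illustration, not a type from the codebase:

#include "tensorrt_llm/common/config.h"

TRTLLM_NAMESPACE_BEGIN

namespace common
{
struct Widget // placeholder for illustration
{
};
} // namespace common

TRTLLM_NAMESPACE_END

// With the default ABI tag, the preprocessor produces:
//
//   namespace tensorrt_llm { inline namespace _v1 { namespace common {
//   struct Widget {};
//   } } }
//
// Source code still writes tensorrt_llm::common::Widget, but the mangled
// symbol carries _v1. Compiling with -DTRTLLM_ABI_NAMESPACE=_v2 changes only
// the mangled names, not the spelling used at call sites.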

View File

@@ -16,6 +16,8 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
+
 #ifdef ENABLE_FP8
 #include <cuda_fp8.h>
 #include <cuda_runtime.h>
@@ -29,8 +31,8 @@
 #define USE_QGMMA
 #endif

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {
@@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T
     const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

 #endif // ENABLE_FP8

View File

@@ -14,12 +14,18 @@
  * limitations under the License.
  */

+#pragma once
+
+#include "tensorrt_llm/common/config.h"
+
 #include <cstdint>
 #include <optional>
 #include <string>
 #include <unordered_set>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 /// @brief Populate the start and end profiling iteration indexes from the provided environment variables
@@ -28,4 +34,6 @@ namespace tensorrt_llm::common
 std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
     std::string const& envVarName, std::optional<std::string> const& legacyEnvVarName = std::nullopt);

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@
  */

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaBf16Wrapper.h"
 #include "tensorrt_llm/common/cudaDriverWrapper.h"
 #include "tensorrt_llm/common/cudaFp8Utils.h"
@@ -49,7 +50,9 @@
 // this undef.
 #endif // WIN32

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 // workspace for cublas gemm : 32MB
@@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq)
 DEFINE_MEMBER_CHECKER(qua)
 DEFINE_MEMBER_CHECKER(high_preciecion_normed_output)

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

 /*
  * Macros compliant with TensorRT coding conventions

View File

@@ -16,11 +16,15 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/tllmException.h"

 #include <NvInferRuntime.h>

 #include <map>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 constexpr static size_t getDTypeSize(nvinfer1::DataType type)
@@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type)
     return "";
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -22,9 +22,12 @@

 #include <string>

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/stringUtils.h"

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 class Logger
@@ -125,12 +128,12 @@ private:

     static inline std::string getPrefix(Level const level)
     {
-        return fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
+        return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level));
     }

     static inline std::string getPrefix(Level const level, int const rank)
     {
-        return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
+        return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank);
     }
 };

@@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
         out << std::endl;
     }
 }

+} // namespace common
+
+TRTLLM_NAMESPACE_END
+
 #define TLLM_LOG(level, ...) \
     do \
@@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format,
 #define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__)
 #define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__)
 #define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__)
-} // namespace tensorrt_llm::common

View File

@@ -16,11 +16,15 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
+
 #include <functional>
 #include <memory>
 #include <optional>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 /**
@@ -100,4 +104,6 @@ public:
     }
 };

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,12 +16,14 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
+
 #include <cstdint>
 #include <optional>
 #include <string>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -480,4 +482,5 @@ public:
 };

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #if ENABLE_BF16
 #include <cuda_bf16.h>
 #endif // ENABLE_BF16
@@ -28,7 +29,9 @@
 #include <unordered_set>
 #include <vector>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 #if ENABLE_BF16
 static inline std::basic_ostream<char>& operator<<(std::basic_ostream<char>& stream, __nv_bfloat16 const& val)
@@ -228,4 +231,6 @@ inline void toUpper(std::string& s)
     }
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/stringUtils.h"

 #include <array>
@@ -41,7 +42,9 @@
     tensorrt_llm::common::RequestSpecificException( \
         __FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode)

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 /// @brief Enumeration of different error codes for request-specific exceptions
@@ -77,7 +80,8 @@ private:

 [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info)
 {
-    throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
+    throw TllmException(
+        file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str());
 }

 [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "")
@@ -102,4 +106,6 @@ private:
     RequestErrorCode mErrorCode;
 };

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,8 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
+
 #include <algorithm>
 #include <initializer_list>
 #include <string>
@@ -24,7 +26,9 @@
 #include <pthread.h>
 #endif

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 inline bool setThreadName(std::string const& name)
@@ -43,4 +47,6 @@ bool contains(std::initializer_list<T> const& c, T const& v)
     return std::find(c.begin(), c.end(), v) != c.end();
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,7 +16,11 @@

 #pragma once

-namespace tensorrt_llm::kernels
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace kernels
 {

 namespace detail
@@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible<Arch>::value;

 } // namespace arch

-} // namespace tensorrt_llm::kernels
+} // namespace kernels
+
+TRTLLM_NAMESPACE_END

View File

@@ -17,11 +17,14 @@

 #pragma once

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/executor/types.h"

 #include <cstdint>
 #include <curand_kernel.h>

-namespace tensorrt_llm::kernels
+TRTLLM_NAMESPACE_BEGIN
+
+namespace kernels
 {

 class FinishedState
@@ -308,4 +311,6 @@ template <typename T>
 void invokeScatterDecodingParams(
     T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream);

-} // namespace tensorrt_llm::kernels
+} // namespace kernels
+
+TRTLLM_NAMESPACE_END

View File

@@ -17,11 +17,14 @@

 #pragma once

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"

 #include <cstdint>
 #include <cuda_runtime.h>

-namespace tensorrt_llm::kernels
+TRTLLM_NAMESPACE_BEGIN
+
+namespace kernels
 {

 class KVCacheIndex
@@ -53,4 +56,6 @@ private:
     UnderlyingType value;
 };

-} // namespace tensorrt_llm::kernels
+} // namespace kernels
+
+TRTLLM_NAMESPACE_END

View File

@@ -14,16 +14,18 @@
  * limitations under the License.
  */

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/runtime/iBuffer.h"

 using namespace tensorrt_llm::runtime;

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace kernels
 {

 void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads,
     unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor,
     cudaStream_t stream);

 } // namespace kernels
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -2175,7 +2175,8 @@ def get_kernel_code(kspec, kname, lname):
     params_str = 'reinterpret_cast<bert::Fused_multihead_attention_params_v2 &>(params)' if generate_cu_trtllm else 'params'
     attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;'
     bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;'
-    include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else ''
+    include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else ''
+    include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else ''
     num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;'
     fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}'
     const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}'
@@ -2201,8 +2202,19 @@ def get_kernel_code(kspec, kname, lname):
     const int COMPUTE_REG_COUNT = {compute_reg_count};
     asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format(
         compute_reg_count=compute_reg_count)

-    local_ns_open = ns_open if generate_cu_trtllm else ''
-    local_ns_close = ns_close if generate_cu_trtllm else ''
+    abi_ns_open = r"""
+TRTLLM_NAMESPACE_BEGIN
+namespace kernels
+{
+// clang-format off
+"""
+    abi_ns_close = r"""
+// clang-format on
+} // namespace kernels
+TRTLLM_NAMESPACE_END
+"""
+    local_ns_open = abi_ns_open if generate_cu_trtllm else ''
+    local_ns_close = abi_ns_close if generate_cu_trtllm else ''

     tmp = dict(locals(), **kspec._asdict())
@@ -3077,8 +3089,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None):

 def get_cubin_header(kernel_traits, specs_names):
     cubins = []
     cubin_lens = []
+    launchers = []
     cubins_dict = {}
     cubin_lens_dict = {}
+    launchers_dict = {}
     for kspec, fname, lname, kname in specs_names:
         if generate_cu_trtllm and not use_cubin_header(
                 kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype):
@@ -3282,11 +3296,11 @@ def get_cubin_header(kernel_traits, specs_names):
         if generate_cu_trtllm and lname != 'nullptr':
             launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, cudaStream_t stream);'.format(
                 lname=lname)
-            if int(sm) in cubins_dict:
-                if launcher not in cubins_dict[int(sm)]:
-                    cubins_dict[int(sm)].append(launcher)
+            if int(sm) in launchers_dict:
+                if launcher not in launchers_dict[int(sm)]:
+                    launchers_dict[int(sm)].append(launcher)
             else:
-                cubins_dict[int(sm)] = [launcher]
+                launchers_dict[int(sm)] = [launcher]
         elif 'mhca' in kname:
             code = '''\
 {{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\
@@ -3309,17 +3323,33 @@
     else:
         metadata_v2 = ',\n'.join(metadata_v2)

     # Add macros to only include needed cubins during compilation.
-    for sm in cubins_dict.keys():
+    # Collect all SM versions from all dictionaries
+    all_sms = sorted(
+        set(
+            list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) +
+            list(launchers_dict.keys())))
+    for sm in all_sms:
         macro_begin = f"#ifndef EXCLUDE_SM_{sm}"
         macro_end = f"#endif\n"
-        cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
+        # Add cubin array declarations
+        if sm in cubins_dict:
+            cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end])
+        # Add cubin length declarations
         if sm in cubin_lens_dict:
             cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end])
+        # Add launcher declarations
+        if sm in launchers_dict:
+            launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end])

     unroll_config_v1 = ',\n'.join(unroll_config_v1)
     unroll_config_v2 = ',\n'.join(unroll_config_v2)
     cubins = '\n'.join(cubins)
     cubin_lens = '\n'.join(cubin_lens)
+    launchers = '\n'.join(launchers)

     local_ns_open = ns_open
     local_ns_close = ns_close if generate_cu_trtllm else '}'
     launcher_line = '''
@@ -3431,7 +3461,157 @@ static const struct TestMetaV2
 '''.format(**locals(), copyright=copyright)

-    return code
+    # Generate header content (.h file)
+    if "GENERATE_CUBIN" in os.environ:
+        header_content = '''\
+{copyright}
+
+#pragma once
+
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
+namespace kernels{{
+
+struct FusedMultiHeadAttentionKernelMetaInfoV2
+{{
+    Data_type mDataTypeIn;
+    Data_type mDataTypeOut;
+    unsigned int mS;
+    unsigned int mStepQ;
+    unsigned int mStepKV;
+    unsigned int mD;
+    unsigned int mDV;
+    unsigned int mSageBlockSizeQ;
+    unsigned int mSageBlockSizeK;
+    unsigned int mSageBlockSizeV;
+    unsigned int mSM;
+    const unsigned char* mCubin;
+    unsigned int mCubinSize;
+    const char* mFuncName;
+    unsigned int mSharedMemBytes;
+    unsigned int mThreadsPerCTA;
+    unsigned int mUnrollStep;
+    int mAttentionMaskType;
+    int mAttentionInputLayout;
+    bool mInterleaved;
+    bool mFlashAttention;
+    bool mWarpSpecialization;
+    bool mFP32Accumulation;
+    bool mAlibiSupported;
+    bool mTiled;
+    bool mEnableAttnLogitSoftcapping;
+    bool mReturnSoftmaxStats;{launcher_line}
+}};
+
+extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[];
+extern const int sMhaKernelMetaInfosV2Size;
+
+}} // namespace kernels
+TRTLLM_NAMESPACE_END
+'''.format(**locals(), copyright=copyright)
+
+        # Generate source content (.cpp file)
+        source_content = '''\
+{copyright}
+
+#include "tensorrt_llm/common/config.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cuda_runtime_api.h>
+
+{local_ns_open}
+//--- Cubin Arrays
+{cubins}
+
+//--- Cubin Lengths
+{cubin_lens}
+{local_ns_close}
+
+using namespace tensorrt_llm::kernels;
+
+namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{
+
+class Fused_multihead_attention_params_v2;
+class Launch_params;
+
+//--- Kernel Launchers
+{launchers}
+
+// FIXME: These are duplicated declarations, we should remove them in the future.
+constexpr int32_t kSM_70 = 70;
+constexpr int32_t kSM_72 = 72;
+constexpr int32_t kSM_75 = 75;
+constexpr int32_t kSM_80 = 80;
+constexpr int32_t kSM_86 = 86;
+constexpr int32_t kSM_89 = 89;
+constexpr int32_t kSM_90 = 90;
+constexpr int32_t kSM_100 = 100;
+constexpr int32_t kSM_100f = 10100;
+constexpr int32_t kSM_103 = 103;
+constexpr int32_t kSM_120 = 120;
+constexpr int32_t kSM_121 = 121;
+
+// FIXME: These are duplicated declarations, we should remove them in the future.
+enum Data_type
+{{
+    DATA_TYPE_BOOL,
+    DATA_TYPE_FP16,
+    DATA_TYPE_FP32,
+    DATA_TYPE_INT4,
+    DATA_TYPE_INT8,
+    DATA_TYPE_INT32,
+    DATA_TYPE_BF16,
+    DATA_TYPE_E2M1,
+    DATA_TYPE_E4M3,
+    DATA_TYPE_E5M2
+}};
+
+struct FusedMultiHeadAttentionKernelMetaInfoV2
+{{
+    Data_type mDataTypeIn;
+    Data_type mDataTypeOut;
+    unsigned int mS;
+    unsigned int mStepQ;
+    unsigned int mStepKV;
+    unsigned int mD;
+    unsigned int mDV;
+    unsigned int mSageBlockSizeQ;
+    unsigned int mSageBlockSizeK;
+    unsigned int mSageBlockSizeV;
+    unsigned int mSM;
+    const unsigned char* mCubin;
+    unsigned int mCubinSize;
+    const char* mFuncName;
+    unsigned int mSharedMemBytes;
+    unsigned int mThreadsPerCTA;
+    unsigned int mUnrollStep;
+    int mAttentionMaskType;
+    int mAttentionInputLayout;
+    bool mInterleaved;
+    bool mFlashAttention;
+    bool mWarpSpecialization;
+    bool mFP32Accumulation;
+    bool mAlibiSupported;
+    bool mTiled;
+    bool mEnableAttnLogitSoftcapping;
+    bool mReturnSoftmaxStats;{launcher_line}
+}};
+
+extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{
+{metadata_v2}
+}};
+
+extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]);
+
+}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
+'''.format(**locals(), copyright=copyright)
+    else:
+        # Non-GENERATE_CUBIN mode: use old behavior
+        header_content = code
+        source_content = None
+
+    return header_content, source_content
# This is used to add some kernels running in cubins for passing CI cases. # This is used to add some kernels running in cubins for passing CI cases.
@@ -3449,9 +3629,20 @@ def modify_cubin_header(cubin_header):
         return result

-    target = "#ifndef EXCLUDE_SM_80"
-    addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
-extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;"""
-    result = add_kernel_line(result, target, addition)
+    addition_cubin_array = """
+#ifndef EXCLUDE_SM_80
+extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[];
+#endif
+"""
+    addition_cubin_length = """
+#ifndef EXCLUDE_SM_80
+extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;
+#endif
+"""
+    # Add cubin array and length into their corresponding sections.
+    result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array)
+    result = add_kernel_line(result, "//--- Cubin Lengths",
+                             addition_cubin_length)

     def modify_kernel_line(result, target, new_line):
         lines = result.split('\n')
@@ -3534,13 +3725,22 @@ def generate_files(specs_names):
     output = output.decode('utf-8').strip()
     # this gives: kname, smem bytes, threads_per_cta, loop_step
     kernel_traits = [traits.split() for traits in output.splitlines()]
-    cubin_header = get_cubin_header(kernel_traits, valid_specs_names)
+    # Use new function to generate both fmha_cubin.h and fmha_cubin.cpp files
+    # To switch back to old behavior, replace get_cubin_header_and_source with get_cubin_header
+    cubin_header, cubin_source = get_cubin_header(kernel_traits,
+                                                  valid_specs_names)
     if generate_cu_trtllm:
-        cubin_header = modify_cubin_header(cubin_header)
+        cubin_source = modify_cubin_header(cubin_source)

+    # Write fmha_cubin.h file
     with open('./generated/fmha_cubin.h', 'w') as f:
         f.write(cubin_header)

+    # Write fmha_cubin.cpp file (same directory as fmha_cubin.h file)
+    if cubin_source is not None:
+        with open('./generated/fmha_cubin.cpp', 'w') as f:
+            f.write(cubin_source)

 def enumerate_hgmma_tma_kernels(specs, sm=90):
     specs.append(
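One consequence of the generator change worth spelling out: the generated header declares the metadata inside the wrapped namespace, while the generated .cpp defines it with the ABI tag written out; both spellings name the same entity once the inline namespace is expanded. A reduced sketch (the value 128 is illustrative, not from the generator):

#include "tensorrt_llm/common/config.h"

// Declaration, in the style of the generated fmha_cubin.h:
TRTLLM_NAMESPACE_BEGIN
namespace kernels
{
extern const int sMhaKernelMetaInfosV2Size;
} // namespace kernels
TRTLLM_NAMESPACE_END

// Definition, in the style of the generated fmha_cubin.cpp; with the default
// tag this reopens tensorrt_llm::_v1::kernels, the same namespace as above.
namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels
{
extern const int sMhaKernelMetaInfosV2Size = 128; // illustrative value
} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels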

View File

@@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/*
  */

 #pragma once

-namespace tensorrt_llm {
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
 namespace kernels {
 '''

@@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}";
 '''

 TEMPLATE_EPILOGUE = '''}
-}
+TRTLLM_NAMESPACE_END
 '''

 D = defaultdict(list)

View File

@@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/*
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

-namespace tensorrt_llm
-{
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
 namespace kernels
 {
 // clang-format off
@@ -96,7 +98,7 @@ namespace kernels
 cpp_file_suffex_text = R"""
 // clang-format on
 } // namespace kernels
-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END
 """

 cubin_meta_info_struct_prefix_text = R"""

View File

@@ -27,7 +27,7 @@ bool initCheckDebug()
 }
 } // namespace

-bool DebugConfig::isCheckDebugEnabled()
+bool tensorrt_llm::DebugConfig::isCheckDebugEnabled()
 {
     static bool const debugEnabled = initCheckDebug();
     return debugEnabled;

View File

@@ -16,6 +16,7 @@
  */

 #include "attentionOp.h"
 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/memoryUtils.h"

View File

@@ -16,6 +16,7 @@
  */

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cublasMMWrapper.h"
 #include "tensorrt_llm/common/opUtils.h"
 #include "tensorrt_llm/common/quantization.h"
@@ -36,7 +37,9 @@
 #include <nccl.h>
 #endif // ENABLE_MULTI_DEVICE

-namespace tensorrt_llm::common::op
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common::op
 {

 class AttentionOp
@@ -543,4 +546,6 @@ private:
     UniqPtrWNullCopy<int32_t[], Deleter> mMultiBlockSemaphores = {};
 };

-} // namespace tensorrt_llm::common::op
+} // namespace common::op
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #include "tensorrt_llm/common/cublasMMWrapper.h"
 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cublasVersionCheck.h"
 #include <algorithm>
 #include <unordered_map>
@@ -24,8 +25,8 @@
 #error CUDART_VERSION Undefined!
 #endif

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t

 } // namespace common
-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
@@ -24,8 +25,8 @@
 #include <optional>
 #include <string>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -185,4 +186,4 @@ public:

 } // namespace common
-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END

View File

@@ -16,12 +16,13 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaBf16Wrapper.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
 #endif // ENABLE_BF16

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

 // Operator definitions intentionally in global namespace
 namespace

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include <assert.h>
 #include <cstdlib>
@@ -28,8 +29,8 @@
 #include <string>
 #include <type_traits>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

 static __host__ __device__ int hash(int val)
@@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer<DEPTH, CTAS_PER_CGA>
 };

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -18,6 +18,7 @@
 #if defined(_WIN32)
 #include <windows.h>
 #define dllOpen(name) LoadLibrary("nv" name ".dll")
 #define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
 #define dllGetSym(handle, name) static_cast<void*>(GetProcAddress(static_cast<HMODULE>(handle), name))
@@ -29,6 +30,7 @@
 #endif // defined(_WIN32)

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaDriverWrapper.h"
 #include "tensorrt_llm/common/logger.h"

 #include <cuda.h>
@@ -36,7 +38,9 @@
 #include <cstdio>
 #include <mutex>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 std::shared_ptr<CUDADriverWrapper> CUDADriverWrapper::getInstance()
@@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters(
     return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config);
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -17,6 +17,7 @@
 #ifndef CUDA_DRIVER_WRAPPER_H
 #define CUDA_DRIVER_WRAPPER_H

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/stringUtils.h"
 #include "tensorrt_llm/common/tllmException.h"

@@ -25,7 +26,9 @@
 #include <cstdio>
 #include <memory>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 class CUDADriverWrapper
@@ -165,8 +168,9 @@ void checkDriverExitSafe(T result, char const* const func, char const* const fil
     }
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

 /*
  * Macros compliant with TensorRT coding conventions
 */

View File

@@ -14,6 +14,7 @@
  * limitations under the License.
  */

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaFp8Utils.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/envUtils.h"
@@ -24,8 +25,8 @@
 #include <limits>
 #include <type_traits>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {
 #ifdef ENABLE_FP8
@@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3);
 #endif // ENABLE_FP8

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -15,6 +15,7 @@
  */

 #include "tensorrt_llm/common/cudaProfilerUtils.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/stringUtils.h"
 #include <cstdint>
@@ -54,7 +55,9 @@ std::tuple<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIte

 } // namespace

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIterationIndexes(
@@ -81,4 +84,6 @@ std::pair<std::unordered_set<int32_t>, std::unordered_set<int32_t>> populateIter
     return std::make_pair(profileIterIdxs, stopIterIdxs);
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -25,9 +25,10 @@
 #if ENABLE_BF16
 #include <cuda_bf16.h>
 #endif
+#include "tensorrt_llm/common/config.h"

+TRTLLM_NAMESPACE_BEGIN

-namespace tensorrt_llm
-{
 namespace common
 {

@@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
 #endif // ENABLE_FP8

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/kernels/customAllReduceKernels.h"
@@ -25,7 +26,9 @@
 using tensorrt_llm::kernels::AllReduceFusionOp;
 using tensorrt_llm::kernels::AllReduceStrategyType;

-namespace tensorrt_llm::utils::customAllReduceUtils
+TRTLLM_NAMESPACE_BEGIN
+
+namespace utils::customAllReduceUtils
 {

 constexpr size_t NUM_POINTERS_PER_RANK = 7;
@@ -292,4 +295,6 @@ inline const std::unordered_map<int, AllReduceBestStrategyTableType> AllReduceBe
     {90, AllReduceBestStrategyTableSM90},
     {100, AllReduceBestStrategyTableSM100},
 };

-} // namespace tensorrt_llm::utils::customAllReduceUtils
+} // namespace utils::customAllReduceUtils
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@
  */

 #include "envUtils.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/stringUtils.h"
@@ -25,7 +26,9 @@
 #include <optional>
 #include <string>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 std::optional<int32_t> getIntEnv(char const* name)
@@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy()
     return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,13 +16,16 @@
  */

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <optional>
 #include <string>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {
 // Useful when you want to inject some debug code controllable with env var.
 std::optional<int32_t> getIntEnv(char const* name);
@@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow();

 bool getEnvEplbForceGdrcopy();

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -19,6 +19,7 @@
 #ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH
 #define TRTLLM_CUDA_LAMPORT_UTILS_CUH

+#include "tensorrt_llm/common/config.h"
 #include <array>
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
@@ -29,7 +30,9 @@

 #include "tensorrt_llm/common/cudaTypeUtils.cuh"

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 constexpr uint16_t kNEGZERO_FP16 = 0x8000U;
@@ -279,6 +282,7 @@ private:
     }
 };

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

 #endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH

View File

@@ -15,12 +15,15 @@
  */

 #include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/stringUtils.h"
 #include "tensorrt_llm/common/tllmException.h"

 #include <cuda_runtime.h>

-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {

 Logger::Logger()
@@ -70,4 +73,6 @@ Logger* Logger::getLogger()
     thread_local Logger instance;
     return &instance;
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,10 +16,11 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include <cuda_runtime.h>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n)
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -14,11 +14,15 @@
  * limitations under the License.
  */

 #include "mcastDevMemUtils.h"
+#include "tensorrt_llm/common/config.h"
 #include <unordered_map>

-namespace tensorrt_llm::common
+using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory;
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {
-using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;

 namespace
 {
@@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr)
 {
     return McastDevMemBufferRegistry::getInstance().findBuffer(ptr);
 }

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -15,13 +15,17 @@
  */

 #pragma once

-// Avoid circular dependency
+#include "tensorrt_llm/common/config.h"
+
 namespace tensorrt_llm::runtime
 {
 class McastDeviceMemory;
-}
+} // namespace tensorrt_llm::runtime

-namespace tensorrt_llm::common
+// Avoid circular dependency
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {
 using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory;

 // Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer!
@@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf);
 // information. Thus a derived pointer cannot used as the key.
 McastDeviceMemory* findMcastDevMemBuffer(void* ptr);

-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END

View File

@@ -15,6 +15,7 @@
  */

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaTypeUtils.cuh"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/memoryUtils.h"
@@ -25,8 +26,8 @@

 #include <sanitizer/asan_interface.h>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -961,4 +962,5 @@ void calcAlignedPointers(
 }

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,13 +16,14 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaFp8Utils.h"
 #include "tensorrt_llm/common/cudaUtils.h"

 #include <cassert>

-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {

@@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers(
 }

 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END

View File

@@ -16,6 +16,7 @@

 #pragma once

 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"

@@ -46,7 +47,9 @@
 #include <dlfcn.h>
 #endif

-namespace tensorrt_llm::common::nccl_util
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common::nccl_util
 {

 //==============================================================================
@@ -392,6 +395,8 @@ inline std::pair<torch::Tensor, NCCLWindowBuffer> createNCCLWindowTensor(
     return std::make_pair(tensor, buffer);
 }

-} // namespace tensorrt_llm::common::nccl_util
+} // namespace common::nccl_util
+
+TRTLLM_NAMESPACE_END

 #endif // ENABLE_MULTI_DEVICE

View File

@@ -25,10 +25,13 @@
 #if defined(__clang__)
 #pragma clang diagnostic pop
 #endif

+#include "tensorrt_llm/common/config.h"
+
 #include <array>

-namespace tensorrt_llm::common::nvtx
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common::nvtx
 {
 inline nvtx3::color nextColor()
 {
@@ -46,8 +49,9 @@ inline nvtx3::color nextColor()
 #endif
 }

-} // namespace tensorrt_llm::common::nvtx
+} // namespace common::nvtx
+
+TRTLLM_NAMESPACE_END

 #define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \
     ::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name)
 #define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range)

View File

@@ -29,6 +29,7 @@
 #include <mutex>
 #include <thread>

+TRTLLM_NAMESPACE_BEGIN

 #if ENABLE_MULTI_DEVICE

 std::unordered_map<nvinfer1::DataType, ncclDataType_t>* getDtypeMap()
@@ -378,3 +379,5 @@ std::shared_ptr<cublasLtHandle_t> getCublasLtHandle()
     });
     return creator();
 }
+
+TRTLLM_NAMESPACE_END

View File

@@ -17,6 +17,7 @@

 #pragma once

+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cublasMMWrapper.h"
 #include "tensorrt_llm/common/workspace.h"

@@ -37,7 +38,9 @@
 #include <string>
 #include <unordered_map>

-namespace tensorrt_llm::common::op
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common::op
 {

 // Write values into buffer
@@ -178,7 +181,7 @@ struct hash
 // for testing only
 void const* getCommSessionHandle();
-} // namespace tensorrt_llm::common::op
+} // namespace common::op

 inline bool isBuilding()
 {
@@ -220,6 +223,8 @@ std::shared_ptr<ncclComm_t> getComm(std::set<int> const& group);
 std::shared_ptr<cublasHandle_t> getCublasHandle();
 std::shared_ptr<cublasLtHandle_t> getCublasLtHandle();

+TRTLLM_NAMESPACE_END
+
 #ifndef DEBUG

 #define PLUGIN_CHECK(status) \

View File

@ -16,14 +16,15 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaFp8Utils.h"
#include <cuda.h> #include <cuda.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <float.h> #include <float.h>
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace common namespace common
{ {
@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3>
#endif // ENABLE_FP8 #endif // ENABLE_FP8
} // namespace common } // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -21,6 +21,7 @@
#else #else
#include <cooperative_groups.h> #include <cooperative_groups.h>
#endif #endif
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -30,8 +31,8 @@
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace common namespace common
{ {
@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input)
} }
} // namespace common } // namespace common
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#include "safetensors.h" #include "safetensors.h"
#include "nlohmann/json.hpp" #include "nlohmann/json.hpp"
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cstdint> #include <cstdint>
#include <fstream> #include <fstream>
@ -25,7 +26,9 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
namespace tensorrt_llm::common::safetensors TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{ {
using nvinfer1::DataType; using nvinfer1::DataType;
@ -164,4 +167,6 @@ std::shared_ptr<ISafeTensor> ISafeTensor::open(char const* filename)
{ {
return std::make_shared<SafeTensor>(filename); return std::make_shared<SafeTensor>(filename);
} }
} // namespace tensorrt_llm::common::safetensors } // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/logger.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cstdint> #include <cstdint>
@ -23,7 +24,9 @@
#include <memory> #include <memory>
#include <utility> #include <utility>
namespace tensorrt_llm::common::safetensors TRTLLM_NAMESPACE_BEGIN
namespace common::safetensors
{ {
class INdArray class INdArray
{ {
@ -58,4 +61,6 @@ public:
virtual ~ISafeTensor() = default; virtual ~ISafeTensor() = default;
}; };
} // namespace tensorrt_llm::common::safetensors } // namespace common::safetensors
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,15 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include <functional> #include <functional>
#include <numeric> #include <numeric>
#include <optional> #include <optional>
#include <sstream> #include <sstream>
namespace tensorrt_llm::common::stl_utils TRTLLM_NAMESPACE_BEGIN
namespace common::stl_utils
{ {
template <typename TInputIt, typename TOutputIt, typename TBinOp> template <typename TInputIt, typename TOutputIt, typename TBinOp>
@ -120,4 +123,6 @@ std::string toString(std::optional<T> const& t, typename std::enable_if_t<HasOpe
return oss.str(); return oss.str();
} }
} // namespace tensorrt_llm::common::stl_utils } // namespace common::stl_utils
TRTLLM_NAMESPACE_END

View File

@ -16,6 +16,7 @@
#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include <cerrno> #include <cerrno>
#include <cstdarg> #include <cstdarg>
@ -23,7 +24,9 @@
#include <iostream> #include <iostream>
#include <string> #include <string>
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args) void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args)
@ -73,4 +76,6 @@ std::unordered_set<std::string> str2set(std::string const& input, char delimiter
return values; return values;
}; };
} // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,13 +14,16 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include <chrono> #include <chrono>
#include <iomanip> #include <iomanip>
#include <sstream> #include <sstream>
#include "tensorrt_llm/common/timestampUtils.h" #include "tensorrt_llm/common/timestampUtils.h"
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
std::string getCurrentTimestamp() std::string getCurrentTimestamp()
@ -39,4 +42,6 @@ std::string getCurrentTimestamp()
return stream.str(); return stream.str();
} }
} // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,12 +14,17 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include <string> #include <string>
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
/// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu" /// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu"
std::string getCurrentTimestamp(); std::string getCurrentTimestamp();
} // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/ */
#include "tensorrt_llm/common/tllmException.h" #include "tensorrt_llm/common/tllmException.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/stringUtils.h"
#include <cinttypes> #include <cinttypes>
@ -26,7 +27,9 @@
#endif #endif
#include <sstream> #include <sstream>
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
namespace namespace
@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept
return mErrorCode; return mErrorCode;
} }
} // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -14,10 +14,13 @@
* limitations under the License. * limitations under the License.
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
namespace tensorrt_llm::common TRTLLM_NAMESPACE_BEGIN
namespace common
{ {
// CuBLAS >= 12.9.1 requires 256-byte alignment. // CuBLAS >= 12.9.1 requires 256-byte alignment.
@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize(
return total; return total;
} }
}; // namespace tensorrt_llm::common } // namespace common
TRTLLM_NAMESPACE_END

View File

@ -18,10 +18,11 @@
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include "cutlass/device_kernel.h" #include "cutlass/device_kernel.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace cutlass_extensions namespace cutlass_extensions
{ {
@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel()
} }
} // namespace cutlass_extensions } // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -30,10 +30,11 @@
#include "cutlass/epilogue/thread/linear_combination_relu.h" #include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h" #include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "cutlass_extensions/epilogue/thread/fused_activations.h" #include "cutlass_extensions/epilogue/thread/fused_activations.h"
#include "tensorrt_llm/common/config.h"
#include <cutlass/epilogue/fusion/operations.hpp> #include <cutlass/epilogue/fusion/operations.hpp>
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace cutlass_extensions namespace cutlass_extensions
{ {
@ -150,4 +151,5 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
}; };
} // namespace cutlass_extensions } // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -24,10 +24,11 @@
#include "cute/tensor.hpp" #include "cute/tensor.hpp"
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/tllmException.h" #include "tensorrt_llm/common/tllmException.h"
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace cutlass_extensions namespace cutlass_extensions
{ {
@ -535,4 +536,5 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
} }
} // namespace cutlass_extensions } // namespace cutlass_extensions
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -52,7 +52,8 @@ namespace tensorrt_llm::executor
namespace namespace
{ {
[[nodiscard]] bool executorConfigIsValid(ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig) [[nodiscard]] bool executorConfigIsValid(
::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{ {
// Make sure logic in this function matches fixExecutorConfig // Make sure logic in this function matches fixExecutorConfig
if (executorConfig.getEnableChunkedContext()) if (executorConfig.getEnableChunkedContext())
@ -65,8 +66,8 @@ namespace
return true; return true;
} }
[[nodiscard]] ExecutorConfig fixExecutorConfig( [[nodiscard]] ::tensorrt_llm::executor::ExecutorConfig fixExecutorConfig(
ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, runtime::ModelConfig const& modelConfig)
{ {
// Make sure logic in this function matches executorConfigIsValid // Make sure logic in this function matches executorConfigIsValid
auto fixedExecutorConfig = executorConfig; auto fixedExecutorConfig = executorConfig;
@ -241,7 +242,7 @@ private:
void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt, void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& modelPathOpt,
std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig, std::optional<BufferView> const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig,
ExecutorConfig const& executorConfig, bool isEncoder, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt) std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{ {
auto const gpusPerNode = jsonConfig.getGpusPerNode(); auto const gpusPerNode = jsonConfig.getGpusPerNode();
@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional<std::filesystem::path> const& model
Executor::Impl::Impl(std::filesystem::path const& modelPath, Executor::Impl::Impl(std::filesystem::path const& modelPath,
std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType, std::optional<std::filesystem::path> const& encoderModelPath, ModelType const modelType,
ExecutorConfig const& executorConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json"); auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json");
@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath,
Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr, Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr,
std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr, std::optional<BufferView> const& encoderEngineBufferView, std::optional<std::string> const& encoderJsonConfigStr,
ModelType const modelType, ExecutorConfig const& executorConfig, ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig,
std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt) std::optional<std::map<std::string, Tensor>> const& managedWeightsOpt)
{ {
auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr); auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr);
@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json
} }
Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel, Executor::Impl::Impl(std::shared_ptr<Model> model, std::optional<std::shared_ptr<Model>> encoderModel,
ExecutorConfig const& executorConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
auto const& worldConfig = model->getWorldConfig(); auto const& worldConfig = model->getWorldConfig();
auto const tp = worldConfig.getTensorParallelism(); auto const tp = worldConfig.getTensorParallelism();
@ -388,7 +389,7 @@ Executor::Impl::~Impl()
shutdown(); shutdown();
} }
void Executor::Impl::initialize(ExecutorConfig const& executorConfig) void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig)
std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine, std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
auto const gptModelType = [&executorConfig, &modelConfig]() auto const gptModelType = [&executorConfig, &modelConfig]()
{ {
@ -512,7 +513,7 @@ std::shared_ptr<Model> Executor::Impl::createModel(runtime::RawEngine const& raw
std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine, std::shared_ptr<Model> Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
ExecutorConfig const& executorConfig) ::tensorrt_llm::executor::ExecutorConfig const& executorConfig)
{ {
auto fixedExecutorConfig = ExecutorConfig{}; auto fixedExecutorConfig = ExecutorConfig{};
fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig()); fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig());
@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm(
} }
void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp, void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, std::optional<ModelType> modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional<ModelType> modelType,
std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig, std::optional<std::filesystem::path> const& modelPath, std::optional<runtime::WorldConfig> const& worldConfig,
std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig) std::optional<runtime::GptJsonConfig> const& decoderGptJsonConfig)
{ {
@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig
} }
void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp, void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp,
ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType,
std::filesystem::path const& modelPath) std::filesystem::path const& modelPath)
{ {
#if ENABLE_MULTI_DEVICE #if ENABLE_MULTI_DEVICE

View File

@ -16,9 +16,12 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels TRTLLM_NAMESPACE_BEGIN
namespace kernels
{ {
void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache, void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* k_cache,
@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca
cudaStream_t stream = 0); cudaStream_t stream = 0);
} }
TRTLLM_NAMESPACE_END

View File

@ -17,12 +17,15 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm::kernels TRTLLM_NAMESPACE_BEGIN
namespace kernels
{ {
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux, void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0, int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con
int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048, int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
cudaStream_t const stream = 0); cudaStream_t const stream = 0);
} // namespace tensorrt_llm::kernels } // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/ */
#include "attentionMask.h" #include "attentionMask.h"
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h"
#include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaFp8Utils.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
@ -24,8 +25,8 @@
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const&
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/gptKernels.h"
#include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/iTensor.h"
@ -25,8 +26,8 @@
namespace tc = tensorrt_llm::common; namespace tc = tensorrt_llm::common;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -64,4 +65,5 @@ template <typename MaskDataType>
void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream); void invokeBuildAttentionMask(AttentionMaskParams<MaskDataType> const& params, cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,14 +14,15 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/banBadWords.h" #include "tensorrt_llm/kernels/banBadWords.h"
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::runtime;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt
SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream); SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -16,12 +16,13 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/common.h"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr,
cudaStream_t stream); cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,14 +14,15 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/banRepeatNgram.h" #include "tensorrt_llm/kernels/banRepeatNgram.h"
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::runtime;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -178,4 +179,4 @@ INVOKE_BAN_REPEAT_NGRAM(__nv_bfloat16)
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm TRTLLM_NAMESPACE_END

View File

@ -16,13 +16,14 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/common.h"
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -34,4 +35,5 @@ void invokeBanRepeatNgram(T* logits, runtime::TokenIdType const** output_ids_buf
runtime::SizeType32 vocab_size_padded, runtime::SizeType32 max_step, cudaStream_t stream); runtime::SizeType32 vocab_size_padded, runtime::SizeType32 max_step, cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,13 +14,14 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/beamSearchKernels.h" #include "tensorrt_llm/kernels/beamSearchKernels.h"
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -355,4 +356,5 @@ template void printLogProbs<float>(float const* x, int const nBS, int const nBMI
template void printLogProbs<half>(half const* x, int const nBS, int const nBMIn, int const nBM, int const nV); template void printLogProbs<half>(half const* x, int const nBS, int const nBMIn, int const nBM, int const nV);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,6 +15,7 @@
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/kernels/topkLastDim.h" // Air TopK #include "tensorrt_llm/kernels/topkLastDim.h" // Air TopK
@ -22,8 +23,8 @@
#define BEAM_SEARCH_DEBUG 0 #define BEAM_SEARCH_DEBUG 0
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
static size_t constexpr kMaxBeamWidth = 1024; // Max beam width supported in TRT-LLM now static size_t constexpr kMaxBeamWidth = 1024; // Max beam width supported in TRT-LLM now
@ -88,7 +89,7 @@ struct BeamHypotheses
// Pointers related to beam search process, they are initialized in those two functions: // Pointers related to beam search process, they are initialized in those two functions:
// [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward // [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward
bool* batchDones{nullptr}; // [BS] %% self.beam_hyps_is_done whether a whole batch is finished bool* batchDones{nullptr}; // [BS] %% self.beam_hyps_is_done whether a whole batch is finished
FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished ::tensorrt_llm::kernels::FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished
// Pointers for backtrack of the beams, they are relocated in [dynamicDecodeLayer.cpp] DynamicDecodeLayer<T>::prepareIdsPtrs // Pointers for backtrack of the beams, they are relocated in [dynamicDecodeLayer.cpp] DynamicDecodeLayer<T>::prepareIdsPtrs
int** outputIdsPtr{nullptr}; // [BS][BM, MSL] %% self.output_ids int** outputIdsPtr{nullptr}; // [BS][BM, MSL] %% self.output_ids
@ -131,11 +132,11 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses&
runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream); runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream);
__global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, __global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates, ::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
__global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, __global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates, ::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
__global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS, __global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS,
@ -219,4 +220,5 @@ void printLogProbs(float const* x, int const nBS, int const nBMIn, int const nBM
#endif #endif
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 1024, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 128, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,13 +15,15 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
// Skip V1 kernels if beam_width > kMaxBeamWidthForV1 // Skip V1 kernels if beam_width > kMaxBeamWidthForV1
INSTANTIATE_BEAM_SEARCH(float, 16, true); INSTANTIATE_BEAM_SEARCH(float, 16, true);
INSTANTIATE_BEAM_SEARCH(half, 16, true); INSTANTIATE_BEAM_SEARCH(half, 16, true);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 256, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 32, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
INSTANTIATE_BEAM_SEARCH(float, 4, false); INSTANTIATE_BEAM_SEARCH(float, 4, false);
@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 4, true);
INSTANTIATE_BEAM_SEARCH(half, 4, false); INSTANTIATE_BEAM_SEARCH(half, 4, false);
INSTANTIATE_BEAM_SEARCH(half, 4, true); INSTANTIATE_BEAM_SEARCH(half, 4, true);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 512, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 64, true);
#endif // FAST_BUILD #endif // FAST_BUILD
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -15,9 +15,10 @@
*/ */
#include "beamSearchKernelsTemplate.h" #include "beamSearchKernelsTemplate.h"
#include "tensorrt_llm/common/config.h"
TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
INSTANTIATE_BEAM_SEARCH(float, 8, false); INSTANTIATE_BEAM_SEARCH(float, 8, false);
@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 8, true);
INSTANTIATE_BEAM_SEARCH(half, 8, false); INSTANTIATE_BEAM_SEARCH(half, 8, false);
INSTANTIATE_BEAM_SEARCH(half, 8, true); INSTANTIATE_BEAM_SEARCH(half, 8, true);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -18,11 +18,13 @@
#error CUDART_VERSION Undefined! #error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11050) #elif (CUDART_VERSION >= 11050)
#include <cub/cub.cuh> #include <cub/cub.cuh>
#else #else
#include "3rdparty/cub/cub.cuh" #include "3rdparty/cub/cub.cuh"
#endif #endif
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/stringUtils.h"
@ -31,8 +33,8 @@
using namespace tensorrt_llm::common; using namespace tensorrt_llm::common;
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -731,4 +733,5 @@ void beamSearchKernelLauncher(
T const* logProbs, T const* bias, void* workspace, BeamHypotheses& bh, cudaStream_t stream); T const* logProbs, T const* bias, void* workspace, BeamHypotheses& bh, cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -14,12 +14,12 @@
* limitations under the License. * limitations under the License.
*/ */
#include "buildRelativeAttentionBiasKernel.h"
#include "tensorrt_llm/common/config.h"
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include "buildRelativeAttentionBiasKernel.h" TRTLLM_NAMESPACE_BEGIN
namespace tensorrt_llm
{
namespace kernels namespace kernels
{ {
@ -99,4 +99,5 @@ template void invokeBuildRelativeAttentionBias<__nv_bfloat16>(__nv_bfloat16* rel
#endif #endif
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -17,10 +17,11 @@
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
namespace tensorrt_llm TRTLLM_NAMESPACE_BEGIN
{
namespace kernels namespace kernels
{ {
@ -30,4 +31,5 @@ void invokeBuildRelativeAttentionBias(T* relative_attention_bias, T const* relat
cudaStream_t stream); cudaStream_t stream);
} // namespace kernels } // namespace kernels
} // namespace tensorrt_llm
TRTLLM_NAMESPACE_END

View File

@ -19,12 +19,15 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include <cub/block/block_load.cuh> #include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh> #include <cub/block/block_store.cuh>
#include "tensorrt_llm/kernels/causalConv1d/causalConv1d.h" #include "tensorrt_llm/kernels/causalConv1d/causalConv1d.h"
namespace tensorrt_llm::kernels::causal_conv1d TRTLLM_NAMESPACE_BEGIN
namespace kernels::causal_conv1d
{ {
template <int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_> template <int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
@ -490,4 +493,6 @@ template void causal_conv1d_update_cuda<float, float>(ConvParamsBase& params, cu
template void causal_conv1d_update_cuda<half, half>(ConvParamsBase& params, cudaStream_t stream); template void causal_conv1d_update_cuda<half, half>(ConvParamsBase& params, cudaStream_t stream);
template void causal_conv1d_update_cuda<nv_bfloat16, nv_bfloat16>(ConvParamsBase& params, cudaStream_t stream); template void causal_conv1d_update_cuda<nv_bfloat16, nv_bfloat16>(ConvParamsBase& params, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::causal_conv1d } // namespace kernels::causal_conv1d
TRTLLM_NAMESPACE_END

View File

@ -20,11 +20,14 @@
#pragma once #pragma once
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
namespace tensorrt_llm::kernels::causal_conv1d TRTLLM_NAMESPACE_BEGIN
namespace kernels::causal_conv1d
{ {
#define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError()) #define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError())
@ -214,4 +217,6 @@ void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream);
template <typename input_t, typename weight_t> template <typename input_t, typename weight_t>
void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream); void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::causal_conv1d } // namespace kernels::causal_conv1d
TRTLLM_NAMESPACE_END

View File

@ -13,13 +13,16 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
#include "tensorrt_llm/kernels/quantization.cuh" #include "tensorrt_llm/kernels/quantization.cuh"
#include <cooperative_groups.h> #include <cooperative_groups.h>
namespace tensorrt_llm::kernels::ar_fusion TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{ {
template <int NRanks> template <int NRanks>
struct SyncComm struct SyncComm
@ -818,4 +821,6 @@ void allreduce_fusion_op(AllReduceFusionParams const& params)
DISPATCH_RANKS(16); DISPATCH_RANKS(16);
TLLM_CHECK_WITH_INFO(false, "allreduce_fusion_kernel: unsupported ranks number!"); TLLM_CHECK_WITH_INFO(false, "allreduce_fusion_kernel: unsupported ranks number!");
} }
}; // namespace tensorrt_llm::kernels::ar_fusion }; // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -15,16 +15,19 @@
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/kernels/quantization.h"
#include "tensorrt_llm/runtime/ipcUtils.h" #include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{ {
template <typename DType> template <typename DType>
struct ElemsPerAccess; struct ElemsPerAccess;
@ -139,4 +142,6 @@ struct AllReduceFusionParams
}; };
void allreduce_fusion_op(AllReduceFusionParams const& params); void allreduce_fusion_op(AllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::ar_fusion } // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -13,9 +13,12 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h"
namespace tensorrt_llm::kernels::ar_fusion TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{ {
__global__ void lamport_initialize_kernel(float* ptr, int size) __global__ void lamport_initialize_kernel(float* ptr, int size)
@ -94,4 +97,6 @@ void** Workspace::get_workspace()
{ {
return reinterpret_cast<void**>(m_workspace); return reinterpret_cast<void**>(m_workspace);
} }
}; // namespace tensorrt_llm::kernels::ar_fusion }; // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -16,11 +16,14 @@
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
#include "tensorrt_llm/runtime/ipcUtils.h" #include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion
{ {
class Workspace class Workspace
@ -41,4 +44,6 @@ private:
}; };
void lamport_initialize(void* ptr, int bytes, cudaStream_t stream); void lamport_initialize(void* ptr, int bytes, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::ar_fusion } // namespace kernels::ar_fusion
TRTLLM_NAMESPACE_END

View File

@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh"
#include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
@ -25,7 +26,9 @@
#include <tuple> #include <tuple>
#include <type_traits> #include <type_traits>
namespace tensorrt_llm::kernels TRTLLM_NAMESPACE_BEGIN
namespace kernels
{ {
using tensorrt_llm::common::divUp; using tensorrt_llm::common::divUp;
@ -1632,4 +1635,6 @@ void customLowPrecisionAllReduce(
sync_check_cuda_error(stream); sync_check_cuda_error(stream);
} }
} // namespace tensorrt_llm::kernels } // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -17,6 +17,7 @@
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/customAllReduceKernels.h" #include "tensorrt_llm/kernels/customAllReduceKernels.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
@ -24,7 +25,9 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <vector> #include <vector>
namespace tensorrt_llm::kernels TRTLLM_NAMESPACE_BEGIN
namespace kernels
{ {
constexpr int LP_ALLREDUCE_MAX_BLOCKS = 8; constexpr int LP_ALLREDUCE_MAX_BLOCKS = 8;
@ -119,4 +122,6 @@ void customLowPrecisionAllReduce(
kernels::LowPrecisionAllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream); kernels::LowPrecisionAllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream);
int32_t max_workspace_size_lowprecision(int32_t tp_size); int32_t max_workspace_size_lowprecision(int32_t tp_size);
} // namespace tensorrt_llm::kernels } // namespace kernels
TRTLLM_NAMESPACE_END

View File

@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "mnnvlAllreduceKernels.h" #include "mnnvlAllreduceKernels.h"
#include "tensorrt_llm/common/config.h"
#include <cooperative_groups.h> #include <cooperative_groups.h>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
@ -31,7 +32,9 @@
#include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh"
namespace tensorrt_llm::kernels::mnnvl TRTLLM_NAMESPACE_BEGIN
namespace kernels::mnnvl
{ {
using tensorrt_llm::common::isNegZero; using tensorrt_llm::common::isNegZero;
@ -1029,4 +1032,6 @@ void twoshotAllreduceFusionOp(AllReduceFusionParams const& params)
} }
} }
} // namespace tensorrt_llm::kernels::mnnvl } // namespace kernels::mnnvl
TRTLLM_NAMESPACE_END

View File

@ -16,11 +16,13 @@
#ifndef TRTLLM_MNNVL_ALLREDUCE_KERNELS_H #ifndef TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
#define TRTLLM_MNNVL_ALLREDUCE_KERNELS_H #define TRTLLM_MNNVL_ALLREDUCE_KERNELS_H
#include "tensorrt_llm/common/config.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cstdint> #include <cstdint>
namespace tensorrt_llm::kernels::mnnvl TRTLLM_NAMESPACE_BEGIN
namespace kernels::mnnvl
{ {
/** /**
@ -66,6 +68,7 @@ struct AllReduceFusionParams
void oneshotAllreduceFusionOp(AllReduceFusionParams const& params); void oneshotAllreduceFusionOp(AllReduceFusionParams const& params);
void twoshotAllreduceFusionOp(AllReduceFusionParams const& params); void twoshotAllreduceFusionOp(AllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::mnnvl } // namespace kernels::mnnvl
TRTLLM_NAMESPACE_END
#endif // TRTLLM_MNNVL_ALLREDUCE_KERNELS_H #endif // TRTLLM_MNNVL_ALLREDUCE_KERNELS_H

View File

@ -13,13 +13,16 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh"
#include "tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h" #include "tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h"
#include "tensorrt_llm/kernels/quantization.cuh" #include "tensorrt_llm/kernels/quantization.cuh"
#include <cooperative_groups.h> #include <cooperative_groups.h>
namespace tensorrt_llm::kernels::ar_fusion::moe TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion::moe
{ {
template <int NRanks> template <int NRanks>
struct LamportComm struct LamportComm
@ -770,4 +773,6 @@ void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& par
#undef MOE_FINALIZE_DISPATCH1 #undef MOE_FINALIZE_DISPATCH1
} }
}; // namespace tensorrt_llm::kernels::ar_fusion::moe }; // namespace kernels::ar_fusion::moe
TRTLLM_NAMESPACE_END

View File

@ -15,16 +15,19 @@
*/ */
#pragma once #pragma once
#include "tensorrt_llm/common/assert.h"
#include <NvInferRuntime.h> #include <NvInferRuntime.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/kernels/quantization.h"
#include "tensorrt_llm/runtime/ipcUtils.h" #include "tensorrt_llm/runtime/ipcUtils.h"
namespace tensorrt_llm::kernels::ar_fusion::moe TRTLLM_NAMESPACE_BEGIN
namespace kernels::ar_fusion::moe
{ {
static constexpr int kElemsPerAccess = 8; static constexpr int kElemsPerAccess = 8;
static constexpr int kOneShotMaxToken = 128; static constexpr int kOneShotMaxToken = 128;
@ -102,4 +105,6 @@ struct MoeFinalizeAllReduceFusionParams : public AllReduceFusionParams
void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& params); void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& params);
} // namespace tensorrt_llm::kernels::ar_fusion::moe } // namespace kernels::ar_fusion::moe
TRTLLM_NAMESPACE_END

Some files were not shown because too many files have changed in this diff.