Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[None][chore] Rename TensorRT-LLM to TensorRT LLM for source code. (#7851)
Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
parent 68b7900a1d
commit 57079cecb3
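The diffs below are mechanical string replacements in user-facing text (error messages, log strings, option descriptions, and comments). As a rough illustration only — the helper below is hypothetical and not part of the TensorRT LLM repository, it simply shows the transformation the commit applies to each affected string:

```cpp
#include <iostream>
#include <string>

// Hypothetical helper (not from the TensorRT LLM codebase): replace every
// occurrence of `from` with `to` in `text`.
std::string replaceAll(std::string text, std::string const& from, std::string const& to)
{
    for (std::size_t pos = text.find(from); pos != std::string::npos; pos = text.find(from, pos + to.size()))
    {
        text.replace(pos, from.size(), to);
    }
    return text;
}

int main()
{
    std::string msg = "[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture.";
    // Prints the renamed message, matching the "+" lines in the diffs below.
    std::cout << replaceAll(msg, "TensorRT-LLM", "TensorRT LLM") << std::endl;
    return 0;
}
```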
@@ -25,7 +25,7 @@ TensorRT LLM
 * [08/01] Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)
 ✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)

-* [07/26] N-Gram Speculative Decoding in TensorRT‑LLM
+* [07/26] N-Gram Speculative Decoding in TensorRT LLM
 ✨ [➡️ link](./docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md)

 * [06/19] Disaggregated Serving in TensorRT LLM
@@ -135,7 +135,7 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da

 int main(int argc, char* argv[])
 {
-cxxopts::Options options("TensorRT-LLM C++ Runtime Benchmark", "TensorRT-LLM C++ Runtime Benchmark for BERT.");
+cxxopts::Options options("TensorRT LLM C++ Runtime Benchmark", "TensorRT LLM C++ Runtime Benchmark for BERT.");
 options.add_options()("h,help", "Print usage");
 options.add_options()(
 "m,model", "Model name specified for engines.", cxxopts::value<std::string>()->default_value("bert_base"));
@@ -1145,7 +1145,7 @@ void benchmark(std::vector<std::filesystem::path> const& contextEngineDirs,

 int main(int argc, char* argv[])
 {
-cxxopts::Options options("TensorRT-LLm DisaggServer Benchmark");
+cxxopts::Options options("TensorRT LLM DisaggServer Benchmark");
 options.add_options()("h,help", "Print usage");
 options.add_options()("context_engine_dirs", "Directories that store context engines,separator is a ,",
 cxxopts::value<std::vector<std::string>>());
@@ -1055,7 +1055,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
 int main(int argc, char* argv[])
 {
 cxxopts::Options options(
-"TensorRT-LLM BatchManager Benchmark", "TensorRT-LLM BatchManager Benchmark for GPT and GPT-like models.");
+"TensorRT LLM BatchManager Benchmark", "TensorRT LLM BatchManager Benchmark for GPT and GPT-like models.");
 options.add_options()("h,help", "Print usage");
 options.add_options()("engine_dir, decoder_engine_dir", "Directory that store the engines of decoder models.",
 cxxopts::value<std::string>());
@@ -217,7 +217,7 @@ std::vector<std::filesystem::path> getJitIncludeDirs()
 }
 else
 {
-TLLM_LOG_WARNING("Failed to find TensorRT-LLM installation, DeepGEMM will be disabled.");
+TLLM_LOG_WARNING("Failed to find TensorRT LLM installation, DeepGEMM will be disabled.");
 }
 }
 return includeDirs;
@@ -165,7 +165,7 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
 {
 void* ret = dllGetSym(handle, name);
 TLLM_CHECK_WITH_INFO(ret != nullptr,
-"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
+"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
 "built with UCX support, please rebuild in UCX-enabled environment.");
 return ret;
 };
@@ -105,7 +105,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm100(T* D, void const* A, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -146,15 +146,15 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
 occupancy);
 break;
 case tkc::CutlassTileConfigSM100::Undefined:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+"[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
 "heuristic.");
 break;
 default:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -177,7 +177,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm120(T* D, void const* A, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -205,16 +205,16 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm120(T* D, void const* A, void const* B,
 occupancy);
 break;
 case tkc::CutlassTileConfigSM120::Undefined:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM120::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+"[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+"[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -257,7 +257,7 @@ size_t dispatchMXFP8xMXFP4GemmClusterShapeSm100(T* D, void const* A, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -293,15 +293,15 @@ size_t dispatchMXFP8xMXFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
 occupancy);
 break;
 case tkc::CutlassTileConfigSM100::Undefined:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+"[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
 "heuristic.");
 break;
 default:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -338,7 +338,7 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
+"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
 }
 }
 else if constexpr (fp4GemmType == FP4GemmType::W4A4_NVFP4_NVFP4)
@@ -356,13 +356,13 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
+"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
 }
 }
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
+"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
 }
 }
@@ -93,7 +93,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
 int* occupancy)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture.");
+"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture.");
 }

 #else
@@ -250,7 +250,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
 {
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
 + std::to_string(mMaxSmemSize);
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
 }
 /* // Return workspace size */
 if (!A && !B && !D)
@@ -261,28 +261,28 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
 {
 std::string errMsg("Requested workspace size insufficient. Required "
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
 }
 auto can_implement = gemm.can_implement(args);
 if (can_implement != cutlass::Status::kSuccess)
 {
 std::string errMsg = "MXFP8xMXFP4 Gemm cutlass kernel will fail for params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
 }
 auto initStatus = gemm.initialize(args, workspace, stream);
 if (initStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to initialize cutlass MXFP8xMXFP4 gemm. Error: "
 + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
 }
 auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL());
 if (runStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg
 = "Failed to run cutlass MXFP8xMXFP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
 }
 return gemm.get_workspace_size(args);
 }
@@ -107,7 +107,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
 int* occupancy) \
 { \
 throw std::runtime_error( \
-"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \
+"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \
 }

 #else
@@ -268,7 +268,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
 { \
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \
 + std::to_string(mMaxSmemSize); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 /* // Return workspace size */ \
 if (!A && !B && !D) \
@@ -279,28 +279,28 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
 { \
 std::string errMsg("Requested workspace size insufficient. Required " \
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto can_implement = gemm.can_implement(args); \
 if (can_implement != cutlass::Status::kSuccess) \
 { \
 std::string errMsg = "FP4 Gemm cutlass kernel will fail for params. Error: " \
 + std::string(cutlassGetStatusString(can_implement)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto initStatus = gemm.initialize(args, workspace, stream); \
 if (initStatus != cutlass::Status::kSuccess) \
 { \
 std::string errMsg \
 = "Failed to initialize cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(initStatus)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \
 if (runStatus != cutlass::Status::kSuccess) \
 { \
 std::string errMsg \
 = "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 return gemm.get_workspace_size(args); \
 }
@@ -69,7 +69,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
 int* occupancy) \
 { \
 throw std::runtime_error( \
-"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \
+"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \
 }

 #else
@@ -224,7 +224,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
 { \
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \
 + std::to_string(mMaxSmemSize); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 /* // Return workspace size */ \
 if (!A && !B && !D) \
@@ -235,7 +235,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
 { \
 std::string errMsg("Requested workspace size insufficient. Required " \
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto initStatus = gemm.initialize(args, workspace); \
 if (initStatus != cutlass::Status::kSuccess) \
@@ -243,14 +243,14 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
 auto cudaErrMsg = cudaGetErrorString(cudaGetLastError()); \
 std::string errMsg = "Failed to initialize cutlass FP4 gemm. Error: " \
 + std::string(cutlass::cutlassGetStatusString(initStatus)) + " " + cudaErrMsg; \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \
 if (runStatus != cutlass::Status::kSuccess) \
 { \
 std::string errMsg \
 = "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlass::cutlassGetStatusString(runStatus)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 return gemm.get_workspace_size(args); \
 }
@@ -75,7 +75,7 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
 {
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
 + std::to_string(mMaxSmemSize);
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }

 // Return workspace size
@@ -88,7 +88,7 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
 {
 std::string errMsg("Requested workspace size insufficient. Required "
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }

 auto can_implement = gemm.can_implement(args);
@@ -96,21 +96,21 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
 {
 std::string errMsg = "fp8RowwiseGemm cutlass kernel not implemented given the params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }

 auto initStatus = gemm.initialize(args, workspace, stream);
 if (initStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }

 auto runStatus = gemm.run(stream);
 if (runStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }
 return gemm.get_workspace_size(args);
 }
@@ -210,7 +210,7 @@ size_t dispatchGemmConfigSm89(void* D, void const* A, void const* B, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm89] Config is invalid for "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm89] Config is invalid for "
 "Fp8 Rowwise GEMM.");
 break;
 }
@@ -299,16 +299,16 @@ size_t dispatchGemmToCutlassSm89(void* D, void const* A, void const* B, void con

 case tkc::CutlassTileConfig::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config undefined.");
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config undefined.");
 break;
 case tkc::CutlassTileConfig::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config should have "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config should have "
 "already been set by heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] Config is invalid for "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] Config is invalid for "
 "Fp8 Rowwise GEMM.");
 break;
 }
@@ -379,7 +379,7 @@ size_t genericFp8RowwiseGemmKernelLauncherSm90(void* D, void const* A, void cons
 Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy);
 #else // COMPILE_HOPPER_TMA_GEMMS
 throw std::runtime_error(
-"[TensorRT-LLm Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing "
+"[TensorRT LLM Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing "
 "90-real as an arch to build_wheel.py.");
 #endif // COMPILE_HOPPER_TMA_GEMMS
 }
@@ -418,7 +418,7 @@ size_t dispatchGemmConfigSm90(void* D, void const* A, void const* B, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm90] Config is invalid for "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm90] Config is invalid for "
 "Fp8 Rowwise GEMM.");
 break;
 }
@@ -468,16 +468,16 @@ size_t dispatchGemmToCutlassSm90(void* D, void const* A, void const* B, void con
 break;
 case tkc::CutlassTileConfigSM90::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
 "already been set by heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for "
 "Fp8 Rowwise GEMM.");
 break;
 }
@@ -517,7 +517,7 @@ size_t CutlassFp8RowwiseGemmRunner<T>::dispatchToArch(void* D, void const* A, vo
 #endif
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
 "Fp8 Rowwise GEMM");
 }
 return 0;
@@ -585,7 +585,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFp8RowwiseGemmRunner<T>::getConfigs()
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
 "Fp8 Rowwise GEMM");
 }
 return candidateConfigs;
@@ -209,7 +209,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
 {
 std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }

 auto init_status = gemm.initialize(args, workspace, stream);
@@ -217,7 +217,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
 {
 std::string err_msg
 = "Failed to initialize cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(init_status));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }

 auto run_status = gemm.run(stream);
@@ -225,7 +225,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
 {
 std::string err_msg
 = "Failed to run cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(run_status));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }
 }

@@ -247,14 +247,14 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B, Sca
 // Multistage only supported on Ampere
 std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
 + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
-throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
 }
 else if constexpr (Stages == 2 && arch::kMinComputeCapability >= 89)
 {
 // Multistage only supported on Ampere
 std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
 + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
-throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
 }
 else if constexpr (cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value
 && arch::kMinComputeCapability < 89)
@@ -262,7 +262,7 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B, Sca
 // FP8 activation type only supported on Ada+ GPUs
 std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
 + std::to_string(arch::kMinComputeCapability) + " with activation type set to FP8";
-throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
 }
 else
 {
@@ -301,7 +301,7 @@ void dispatch_gemm_config(ActivationType const* A, WeightType const* B, ScaleZer
 break;
 default:
 std::string err_msg = "dispatch_gemm_config does not support stages " + std::to_string(gemm_config.stages);
-throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_config] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_config] " + err_msg);
 break;
 }
 }
@@ -370,16 +370,16 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, Scal
 C, m, n, k, group_size, gemm_config, workspace, workspace_bytes, stream, occupancy);
 break;
 case tkc::CutlassTileConfig::Undefined:
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined.");
 break;
 case tkc::CutlassTileConfig::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have already been set by "
+"[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
+"[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
 break;
 }
 }
@@ -387,7 +387,7 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, Scal
 {
 // This is not a limitation in CUTLASS. We just do not need to support this case.
 std::string err_msg = "The activation type must equal the scale, bias and output types on Ampere and earlier.";
-throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_to_cutlass] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_to_cutlass] " + err_msg);
 }
 }
@@ -439,7 +439,7 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
 if constexpr (cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada needs "
+"[TensorRT LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada needs "
 "CUDA>=12.4");
 }
 #endif
@@ -459,7 +459,7 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] Arch unsupported for CUTLASS mixed type "
+"[TensorRT LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] Arch unsupported for CUTLASS mixed type "
 "GEMM");
 }
 }
@@ -62,7 +62,7 @@ void sm90_dispatch_epilogue_schedules(ActivationType const* A, WeightType const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_epilogue_schedules] epilogue schedule config is invalid for "
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_epilogue_schedules] epilogue schedule config is invalid for "
 "mixed "
 "type GEMM.");
 break;
@@ -135,7 +135,7 @@ void sm90_dispatch_mainloop_schedules(ActivationType const* A, WeightType const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] mainloop schedule config is invalid "
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] mainloop schedule config is invalid "
 "for "
 "mixed type GEMM.");
 break;
@@ -144,7 +144,7 @@ void sm90_dispatch_mainloop_schedules(ActivationType const* A, WeightType const*
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] Unsupported CTA and Cluster shapes for "
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] Unsupported CTA and Cluster shapes for "
 "mixed type GEMM.");
 }
 }
@@ -181,7 +181,7 @@ void sm90_dispatch_gemm_config(ActivationType const* A, WeightType const* B, Sca
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][fpA_intB][dispatch_CGA_config] Config is invalid for mixed type GEMM.");
+"[TensorRT LLM Error][fpA_intB][dispatch_CGA_config] Config is invalid for mixed type GEMM.");
 break;
 }
 }
@@ -254,16 +254,16 @@ void sm90_dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B,
 break;
 case tkc::CutlassTileConfigSM90::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config undefined.");
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config should have already been set by "
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config should have already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
 break;
 }
 }
@@ -193,7 +193,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 if (group_size % cta_shape_k != 0)
 {
 std::string err_msg = "The group size must a multiple of " + std::to_string(cta_shape_k);
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner]" + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner]" + err_msg);
 }

 if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY)
@@ -249,7 +249,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 Gemm gemm;
 if (gemm.get_workspace_size(args) > workspace_bytes)
 {
-TLLM_LOG_ERROR("[TensorRT-LLm Error][fpA_intB Runner] given workspace size insufficient.");
+TLLM_LOG_ERROR("[TensorRT LLM Error][fpA_intB Runner] given workspace size insufficient.");
 }

 auto can_implement = gemm.can_implement(args);
@@ -258,7 +258,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
 std::cout << err_msg << std::endl;
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }

 auto init_status = gemm.initialize(args, workspace, stream);
@@ -266,7 +266,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 {
 std::string err_msg = "Failed to initialize cutlass fpA_intB gemm. Error: "
 + std::string(cutlassGetStatusString(init_status));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }

 auto run_status = gemm.run(stream);
@@ -274,13 +274,13 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 {
 std::string err_msg
 = "Failed to run cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(run_status));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }
 }
 else
 {
 std::stringstream ss;
-ss << "[TensorRT-LLm Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << ","
+ss << "[TensorRT LLM Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << ","
 << (int64_t) cute::size<1>(CTAShape{}) << "," << (int64_t) cute::size<2>(CTAShape{}) << ") ("
 << (int64_t) cute::size<0>(ClusterShape{}) << "," << (int64_t) cute::size<1>(ClusterShape{}) << ","
 << (int64_t) cute::size<2>(ClusterShape{}) << ") not compiled with FAST_BUILD.";
@@ -290,7 +290,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType

 #else // COMPILE_HOPPER_TMA_GEMMS
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an arch "
+"[TensorRT LLM Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an arch "
 "to build_wheel.py.");
 #endif // COMPILE_HOPPER_TMA_GEMMS
 }
@@ -67,7 +67,7 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
 {
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
 + std::to_string(mMaxSmemSize);
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }

 // Return workspace size
@@ -80,7 +80,7 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
 {
 std::string errMsg("Requested workspace size insufficient. Required "
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }

 auto can_implement = gemm.can_implement(args);
@@ -88,21 +88,21 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
 {
 std::string errMsg = "fusedGatedGemm cutlass kernel not implemented given the params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }

 auto initStatus = gemm.initialize(args, workspace, stream);
 if (initStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }

 auto runStatus = gemm.run(stream);
 if (runStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }
 return gemm.get_workspace_size(args);
 }
@@ -165,7 +165,7 @@ size_t genericGemmGatedKernelLauncherSm90(void* D, void const* A, void const* B,
 return typedGemmGatedKernelLauncher(Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy);
 #else // COMPILE_HOPPER_TMA_GEMMS
 throw std::runtime_error(
-"[TensorRT-LLm Error][GemmGatedKernelLauncherSm90] Please recompile with support for hopper by passing 90-real "
+"[TensorRT LLM Error][GemmGatedKernelLauncherSm90] Please recompile with support for hopper by passing 90-real "
 "as an arch to build_wheel.py.");
 #endif // COMPILE_HOPPER_TMA_GEMMS
 }
@@ -204,7 +204,7 @@ size_t dispatchGemmConfigSm90(void* D, void const* A, void const* B, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmConfigSm90] Config is invalid for fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmConfigSm90] Config is invalid for fused "
 "gated GEMM.");
 break;
 }
@@ -255,17 +255,17 @@ size_t dispatchGemmToCutlassSm90(void* D, void const* A, void const* B, void con
 break;
 case tkc::CutlassTileConfigSM90::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
 "already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for fused "
 "gated GEMM.");
 break;
 }
@@ -302,14 +302,14 @@ size_t CutlassFusedGatedGemmRunner<T>::dispatchToArch(void* D, void const* A, vo
 #endif
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
 "gated GEMM");
 }
 }
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
 "gated "
 "GEMM");
 }
@@ -340,7 +340,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFusedGatedGemmRunner<T>::getConfigs()
 if (mSm != 90)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
 "gated GEMM");
 }
 tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param
@@ -378,7 +378,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFusedGatedGemmRunner<T>::getConfigs()
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
 "gated "
 "GEMM");
 }
@@ -150,7 +150,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
 {
 std::string errMsg = "int8gemm cutlass kernel will fail for params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
 }

 auto initStatus = gemm.initialize(args, workspace, stream);
@@ -158,7 +158,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
 {
 std::string errMsg
 = "Failed to initialize cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
 }

 auto runStatus = gemm.run(stream);
@@ -166,7 +166,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
 {
 std::string errMsg
 = "Failed to run cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
 }
 }

@@ -180,7 +180,7 @@ struct dispatchStages
 TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
 std::string errMsg = "Cutlass int8 gemm. Not instantiates for arch "
 + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
-throw std::runtime_error("[TensorRT-LLM Error][dispatchStages::dispatch] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][dispatchStages::dispatch] " + errMsg);
 }
 };

@@ -248,7 +248,7 @@ void dispatchGemmConfig(int8_t const* A, int8_t const* B, tk::QuantMode quantOpt
 break;
 default:
 std::string errMsg = "dispatchGemmConfig does not support stages " + std::to_string(gemmConfig.stages);
-throw std::runtime_error("[TensorRT-LLM Error][dispatch_gemm_config] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_config] " + errMsg);
 break;
 }
 }
@@ -288,16 +288,16 @@ void dispatchGemmToCutlass(int8_t const* A, int8_t const* B, tk::QuantMode quant
 quantOption, alphaCol, alphaRow, C, m, n, k, gemmConfig, workspace, workspaceBytes, stream, occupancy);
 break;
 case tkc::CutlassTileConfig::Undefined:
-throw std::runtime_error("[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] gemm config undefined.");
 break;
 case tkc::CutlassTileConfig::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] gemm config should have already been set by "
+"[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] gemm config should have already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] Config is invalid for int8 GEMM.");
+"[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] Config is invalid for int8 GEMM.");
 break;
 }
 }
@@ -342,7 +342,7 @@ void CutlassInt8GemmRunner<T>::dispatchToArch(int8_t const* A, int8_t const* B,
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
+"[TensorRT LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
 }
 }

@@ -364,7 +364,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassInt8GemmRunner<T>::getConfigs() const
 if (mSm <= 70)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
+"[TensorRT LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
 }

 std::vector<tkc::CutlassGemmConfig> candidateConfigs = get_candidate_configs(mSm, SPLIT_K_LIMIT, config_type_param);
@@ -195,7 +195,7 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
 {
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
 + std::to_string(mMaxSmemSize);
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }

 // Return workspace size
@@ -208,7 +208,7 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
 {
 std::string errMsg("Requested workspace size insufficient. Required "
 + std::to_string(gemm.get_workspace_size(arguments)) + ", got " + std::to_string(workspaceBytes));
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }

 auto can_implement = gemm.can_implement(arguments);
@@ -216,26 +216,26 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
 {
 std::string errMsg = "Fp8LowLatencyGemm cutlass kernel not implemented given the params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }

 auto initStatus = gemm.initialize(arguments, workspacePtr);
 if (initStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }

 auto runStatus = gemm.run(stream, nullptr, pdl_overlap_ratio >= 0);
 if (runStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }
 return gemm.get_workspace_size(arguments);
 #else // COMPILE_HOPPER_TMA_GEMMS
 throw std::runtime_error(
-"[TensorRT-LLm Error][genericFp8LowLatencyGemmKernelLauncherSm90] Please recompile with support for hopper by "
+"[TensorRT LLM Error][genericFp8LowLatencyGemmKernelLauncherSm90] Please recompile with support for hopper by "
 "passing 90-real as an arch to build_wheel.py.");
 #endif // COMPILE_HOPPER_TMA_GEMMS
 }
@@ -264,7 +264,7 @@ size_t dispatchLowLatencyGemmCultassKernelSchedSm90(__nv_fp8_e4m3 const* A, __nv
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmCultassKernelSchedSm90] Config "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmCultassKernelSchedSm90] Config "
 "is "
 "invalid for low latency fp8 gemm");
 break;
@@ -300,7 +300,7 @@ size_t dispatchLowLatencyGemmClusterShapeSm90(__nv_fp8_e4m3 const* A, __nv_fp8_e

 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmClusterShapeSm90] Config is "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmClusterShapeSm90] Config is "
 "invalid for low latency fp8 gemm");
 break;
 }
@@ -369,19 +369,19 @@ size_t dispatchLowLatencyGemmToCutlassSm90(__nv_fp8_e4m3 const* A, __nv_fp8_e4m3
 break;
 case tkc::CutlassTileConfigSM90::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
 "undefined.");
 break;
 case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
 "should have "
 "already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] Config is "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] Config is "
 "invalid for low latency fp8 gemm");
 break;
 }
@@ -413,7 +413,7 @@ size_t CutlassLowLatencyFp8GemmRunner<T>::dispatchToArch(__nv_fp8_e4m3 const* A,
 {

 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS Low "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS Low "
 "Latency Gemm");
 }
 return 0;
@@ -499,7 +499,7 @@ std::vector<ConfigType> CutlassLowLatencyFp8GemmRunner<T>::getConfigs() const
 if (mSm != 90)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP8 Low "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP8 Low "
 "Latency GEMM");
 }
 tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param
@@ -235,12 +235,12 @@ struct BatchedGemmData
 void const* mPtrBias{nullptr};

 // The output tensor scaling factor for MxFp{4,8}, Fp8 and NvFp4 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [B].
 float const* mPtrScaleC{nullptr};

 // The output gate scale for MxFp{4,8} and NvFp4 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [B].
 float const* mPtrScaleGate{nullptr};

@@ -214,12 +214,12 @@ struct KernelParams
 // ScaleC = SEncC
 //
 // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [B]. One scaling factor per tensor in batch.
 float const* ptrScaleC{nullptr};

 // The output gate scale for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [B]. One scaling factor per tensor in batch.
 float const* ptrScaleGate{nullptr};

@@ -143,7 +143,7 @@ struct GemmData
 void const* mPtrPerTokenSfB{nullptr};

 // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 void* mPtrScaleC{nullptr};
 };

@@ -204,7 +204,7 @@ struct KernelParams
 void* ptrSfC;

 // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 float const* ptrScaleC;

@@ -133,11 +133,11 @@ struct GemmGatedActData
 void const* mPtrPerTokenSfB{nullptr};

 // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 void const* mPtrScaleC{nullptr};
 // The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 void const* mPtrScaleGate{nullptr};
 };

@@ -290,7 +290,7 @@ struct KernelParams
 // y = act(ptrScaleGate[0] * y1) * (ptrScaleC[0] * y2)
 //
 // The output tensor scaling factor for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 float const* ptrScaleC;
 // The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.
@ -73,7 +73,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con

NB_MODULE(TRTLLM_NB_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
m.doc() = "TensorRT LLM Python bindings for C++ runtime";
m.attr("binding_type") = "nanobind";
nb::set_leak_warnings(false);


@ -125,7 +125,7 @@ BertAttentionPlugin::BertAttentionPlugin(void const* data, size_t length)

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ CudaStreamPlugin::CudaStreamPlugin(void const* data, size_t length)

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -58,7 +58,7 @@ EagleDecodeDraftTokensPlugin::EagleDecodeDraftTokensPlugin(void const* data, siz
read(d, mTopKSampling);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
static_cast<int>(length), static_cast<int>(d - a));
}

@ -52,7 +52,7 @@ EagleSampleAndAcceptDraftTokensPlugin::EagleSampleAndAcceptDraftTokensPlugin(voi
read(d, mDtype);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -47,7 +47,7 @@ FusedLayernormPlugin::FusedLayernormPlugin(void const* data, size_t length)
read(d, mType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -203,7 +203,7 @@ static GemmAllReducePluginOptions deserializeOptions(void const*& data, size_t l

TLLM_CHECK_WITH_INFO(end == begin + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (end - begin));


@ -179,7 +179,7 @@ GemmPlugin::GemmPlugin(void const* data, size_t length, GemmPlugin::PluginProfil

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -183,7 +183,7 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const* data, size_t leng
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
TLLM_CHECK_WITH_INFO((smVersion() >= 80) || (mType != nvinfer1::DataType::kBF16),

@ -35,7 +35,7 @@ IdentityPlugin::IdentityPlugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -61,7 +61,7 @@ LayernormQuantizationPlugin::LayernormQuantizationPlugin(void const* data, size_
read(d, mOutputType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ LookupPlugin::LookupPlugin(void const* data, size_t length)
read(d, mRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -78,7 +78,7 @@ LoraPlugin::LoraPlugin(void const* data, size_t length, LoraPlugin::PluginProfil

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -124,7 +124,7 @@ LowLatencyGemmPlugin::LowLatencyGemmPlugin(void const* data, size_t length, Plug
mPluginProfiler->deserialize(d, mDims, mGemmId);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -159,7 +159,7 @@ LowLatencyGemmSwigluPlugin::LowLatencyGemmSwigluPlugin(
mPluginProfiler->deserialize(d, mDims, mGemmId);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -175,7 +175,7 @@ MixtureOfExpertsPlugin::MixtureOfExpertsPlugin(void const* data, size_t length,

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ AllgatherPlugin::AllgatherPlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -77,7 +77,7 @@ AllreducePlugin::AllreducePlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
check();

@ -45,7 +45,7 @@ RecvPlugin::RecvPlugin(void const* data, size_t length)
read(d, mSrcRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ ReduceScatterPlugin::ReduceScatterPlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -46,7 +46,7 @@ SendPlugin::SendPlugin(void const* data, size_t length)
read(d, mTgtRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -64,7 +64,7 @@ QServeGemmPlugin::QServeGemmPlugin(void const* data, size_t length)

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -51,7 +51,7 @@ QuantizePerTokenPlugin::QuantizePerTokenPlugin(void const* data, size_t length)
read(d, mSumPerToken);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -35,7 +35,7 @@ QuantizeTensorPlugin::QuantizeTensorPlugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -41,7 +41,7 @@ QuantizeToFP4Plugin::QuantizeToFP4Plugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -58,7 +58,7 @@ RmsnormQuantizationPlugin::RmsnormQuantizationPlugin(void const* data, size_t le
read(d, mOutputType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -98,7 +98,7 @@ SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -148,7 +148,7 @@ WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -126,7 +126,7 @@ WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin(

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -67,7 +67,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con

PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
m.doc() = "TensorRT LLM Python bindings for C++ runtime";
m.attr("binding_type") = "pybind";

// Create MpiComm binding first since it's used in the executor bindings

@ -56,7 +56,7 @@ public:
}

/// @brief If multiple TensorRT optimization profiles are built in the engine, this function selects the
/// corresponding profile that is going to be used based on the runtime shape, for now, TensorRT-LLM only split
/// corresponding profile that is going to be used based on the runtime shape, for now, TensorRT LLM only split
/// multiple profiles on the num_tokens dimension, hence the profile index is selected based on which profile
/// handles the actual num_tokens
/// @return The index of the selected TensorRT optimization profile

@ -330,7 +330,7 @@ protected:
{
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};
@ -732,7 +732,7 @@ protected:
{
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};

@ -70,7 +70,7 @@ std::unique_ptr<texec::kv_cache::ConnectionManager> makeOneUcxConnectionManager(
void* ret = dllGetSym(handle, name);

TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};

@ -243,7 +243,7 @@ Result run(std::string description, Options& options, Buffers& buffers)
auto can_implement = device_gemm.can_implement(arguments);
if (can_implement != cutlass::Status::kSuccess)
{
throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner]");
throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner]");
}

// Initialize CUTLASS kernel with arguments and workspace pointer
@ -481,7 +481,7 @@ int main(int argc, char const** argv)
#ifdef COMPILE_HOPPER_TMA_GEMMS
Result hopperFp8 = run<Gemm>(std::string("Hopper fp8 swiglu"), options, buffers);
#else // COMPILE_HOPPER_TMA_GEMMS
std::cout << "[TensorRT-LLm Error][GemmSwigluKernelTestSm90Fp8] Please recompile with support for hopper by "
std::cout << "[TensorRT LLM Error][GemmSwigluKernelTestSm90Fp8] Please recompile with support for hopper by "
"passing 90-real as an arch to build_wheel.py."
<< std::endl;
#endif // COMPILE_HOPPER_TMA_GEMMS

@ -338,7 +338,7 @@ TEST(GemmSwigluRunner, Sm90FP8)
Result hopperFp8 = run("SM90 FP8 WS GEMM", options, buffers);
EXPECT_TRUE(hopperFp8.passed);
#else // COMPILE_HOPPER_TMA_GEMMS
std::cout << "[TensorRT-LLm Error][GemmSwigluRunnerTest] Please recompile with support for hopper by passing "
std::cout << "[TensorRT LLM Error][GemmSwigluRunnerTest] Please recompile with support for hopper by passing "
"90-real as an arch to build_wheel.py."
<< std::endl;
#endif // COMPILE_HOPPER_TMA_GEMMS

@ -1,6 +1,6 @@
"""
NOTE: This FastAPI-based server is only an example for demonstrating the usage
of TensorRT-LLM LLM API. It is not intended for production use.
of TensorRT LLM LLM API. It is not intended for production use.
For production, use the `trtllm-serve` command. The server exposes OpenAI compatible API endpoints.
"""


@ -28,11 +28,11 @@ int main(int argc, char* argv[])
void log(nvinfer1::ILogger::Severity severity, char const* msg) noexcept override
{
if (severity <= nvinfer1::ILogger::Severity::kERROR)
std::cerr << "[TensorRT-LLM ERR]: " << msg << std::endl;
std::cerr << "[TensorRT LLM ERR]: " << msg << std::endl;
else if (severity == nvinfer1::ILogger::Severity::kWARNING)
std::cerr << "[TensorRT-LLM WARNING]: " << msg << std::endl;
std::cerr << "[TensorRT LLM WARNING]: " << msg << std::endl;
else
std::cout << "[TensorRT-LLM LOG]: " << msg << std::endl;
std::cout << "[TensorRT LLM LOG]: " << msg << std::endl;
}
};


@ -144,7 +144,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -12,7 +12,7 @@ def parse_arguments():
'--output_path',
type=str,
default='config.json',
help='The path to save the TensorRT-LLM checkpoint config.json file')
help='The path to save the TensorRT LLM checkpoint config.json file')
parser.add_argument('--architecture', type=str, default='GPTForCausalLM')
parser.add_argument('--dtype',
type=str,

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

@ -161,7 +161,7 @@ def demonstrate_with_logprobs(prompt: str):

def run_all_demonstrations(model_path: Optional[str] = None):
"""Run all sampling demonstrations."""
print("🚀 TensorRT-LLM Sampling Techniques Showcase")
print("🚀 TensorRT LLM Sampling Techniques Showcase")
print("=" * 50)

# Use the first prompt for most demonstrations

@ -161,7 +161,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -53,7 +53,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -156,7 +156,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=Path,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--calib_dataset',
type=str,

@ -190,7 +190,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -90,7 +90,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -79,7 +79,7 @@ def parse_arguments():
type=str,
default='trtllm_checkpoint',
required=True,
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -79,7 +79,7 @@ def parse_arguments():
type=str,
default='trtllm_checkpoint',
required=True,
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -87,7 +87,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument('--input_size',
type=int,
default=64,

@ -74,7 +74,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -61,7 +61,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -76,7 +76,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -110,7 +110,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -37,7 +37,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -124,7 +124,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -76,7 +76,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -44,7 +44,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument('--caption_channels',
type=int,
default=4096,

@ -47,7 +47,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -79,7 +79,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -260,7 +260,7 @@ def main() -> None:
trt_llm_config.query_pre_attn_scalar = ckpt_config.query_pre_attn_scalar

trt_llm_config_dict = trt_llm_config.to_dict()
print(f"Determined TensorRT-LLM configuration {trt_llm_config_dict}")
print(f"Determined TensorRT LLM configuration {trt_llm_config_dict}")

save_config(trt_llm_config, output_dir=args.output_model_dir, log=True)


@ -127,7 +127,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -132,7 +132,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -71,7 +71,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -227,7 +227,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -51,7 +51,7 @@ def parse_args():
'--max_input_len',
type=int,
default=6400,
help='The max input length TensorRT-LLM engine was built with')
help='The max input length TensorRT LLM engine was built with')
parser.add_argument('--log_level', type=str, default='info')
parser.add_argument('--max_ite', type=int, default=5)
parser.add_argument(
@ -392,7 +392,7 @@ def main(args):
references=[hf_summary[ite][beam_idx][batch_idx]])

for beam_idx in range(args.num_beams):
logger.info(f"TensorRT-LLM beam {beam_idx} result")
logger.info(f"TensorRT LLM beam {beam_idx} result")
computed_metrics_tensorrt_llm = metric_tensorrt_llm[
beam_idx].compute()
for key in computed_metrics_tensorrt_llm.keys():

@ -59,7 +59,7 @@ def parse_arguments():
'--output_dir',
type=Path,
default='mamba_tllm_checkpoint',
help='The path to save the mamba TensorRT-LLM checkpoint')
help='The path to save the mamba TensorRT LLM checkpoint')
parser.add_argument('--log_level', type=str, default='info')
parser.add_argument(
'--workers',

@ -192,7 +192,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -132,11 +132,11 @@ def load_hf_model(args):


def load_trtllm_model(args):
profiler.start('load TensorRT-LLM model')
profiler.start('load TensorRT LLM model')
trtllm_model = MultimodalModelRunner(args)
profiler.stop('load TensorRT-LLM model')
profiler.stop('load TensorRT LLM model')
logger.info(
f'Load TensorRT-LLM model takes: {profiler.elapsed_time_in_sec("load TensorRT-LLM model")} sec'
f'Load TensorRT LLM model takes: {profiler.elapsed_time_in_sec("load TensorRT LLM model")} sec'
)
return trtllm_model


@ -56,7 +56,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -81,7 +81,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -137,7 +137,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -316,7 +316,7 @@ class QWenInfer(object):
stream.cuda_stream)
stream.synchronize()
audio_time = profiler.stop("Audio") / run_time
logger.info(f"TensorRT-LLM Audio latency: {audio_time:3f} sec ")
logger.info(f"TensorRT LLM Audio latency: {audio_time:3f} sec ")

assert ok, "Runtime execution failed for audio session"

@ -567,7 +567,7 @@ class QWenInfer(object):
print(f'Output(beam: {beam}): "{output_text}"')
logger.info(f"Input length={input_lengths[b]}")
logger.info(f"Output length={output_ids.shape}")
logger.info(f"TensorRT-LLM QWen time: {Qwen_time:3f} sec ")
logger.info(f"TensorRT LLM QWen time: {Qwen_time:3f} sec ")
if isinstance(history, list):
history.append({'role': 'assistant', 'content': output_text})
return output_text, past_audio_features

@ -418,7 +418,7 @@ class QWenInfer(object):
print(f'Output(beam: {beam}): "{output_text}"')
logger.info(f"Input length={input_lengths[b]}")
logger.info(f"Output length={output_ids.shape}")
logger.info(f"TensorRT-LLM QWen time: {Qwen_time:3f} sec ")
logger.info(f"TensorRT LLM QWen time: {Qwen_time:3f} sec ")
history.append((query, output_text))
return output_text

@ -516,7 +516,7 @@ def vit_process(image_path, vit_engine_path, stream):
ok = session_vit.run(visual_inputs, visual_outputs, stream)
profiler.stop("ViT")
Vit_time = profiler.elapsed_time_in_sec("ViT") / run_time
logger.info(f"TensorRT-LLM ViT latency: {Vit_time:3f} sec ")
logger.info(f"TensorRT LLM ViT latency: {Vit_time:3f} sec ")

assert ok, "Runtime execution failed for vit session"


@ -41,7 +41,7 @@ def parse_arguments():
"--output_dir",
type=Path,
default="recurrentgemma_tllm_checkpoint",
help="The path to save the recurrentgemma TensorRT-LLM checkpoint")
help="The path to save the recurrentgemma TensorRT LLM checkpoint")
parser.add_argument("--log_level", type=str, default="info")
args = parser.parse_args()
return args
@ -506,11 +506,11 @@ def main():
)

trt_llm_config_dict = trt_llm_config.to_dict()
print(f"Determined TensorRT-LLM configuration {trt_llm_config_dict}")
print(f"Determined TensorRT LLM configuration {trt_llm_config_dict}")

config_path = args.output_dir / "config.json"
config_path.parent.mkdir(exist_ok=True, parents=True)
LOGGER.debug(f"Saving TensorRT-LLM configuration to {config_path}")
LOGGER.debug(f"Saving TensorRT LLM configuration to {config_path}")
with config_path.open("w") as config_file:
json.dump(trt_llm_config_dict, config_file, indent=4)

@ -42,7 +42,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,
Some files were not shown because too many files have changed in this diff.