diff --git a/README.md b/README.md index 47b03b3351..a9e69f2f3a 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ TensorRT LLM * [08/01] Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization) ✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md) -* [07/26] N-Gram Speculative Decoding in TensorRT‑LLM +* [07/26] N-Gram Speculative Decoding in TensorRT LLM ✨ [➡️ link](./docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md) * [06/19] Disaggregated Serving in TensorRT LLM diff --git a/benchmarks/cpp/bertBenchmark.cpp b/benchmarks/cpp/bertBenchmark.cpp index 655feffe52..cc10a5b49e 100644 --- a/benchmarks/cpp/bertBenchmark.cpp +++ b/benchmarks/cpp/bertBenchmark.cpp @@ -135,7 +135,7 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da int main(int argc, char* argv[]) { - cxxopts::Options options("TensorRT-LLM C++ Runtime Benchmark", "TensorRT-LLM C++ Runtime Benchmark for BERT."); + cxxopts::Options options("TensorRT LLM C++ Runtime Benchmark", "TensorRT LLM C++ Runtime Benchmark for BERT."); options.add_options()("h,help", "Print usage"); options.add_options()( "m,model", "Model name specified for engines.", cxxopts::value()->default_value("bert_base")); diff --git a/benchmarks/cpp/disaggServerBenchmark.cpp b/benchmarks/cpp/disaggServerBenchmark.cpp index ab00980275..89efae4539 100644 --- a/benchmarks/cpp/disaggServerBenchmark.cpp +++ b/benchmarks/cpp/disaggServerBenchmark.cpp @@ -1145,7 +1145,7 @@ void benchmark(std::vector const& contextEngineDirs, int main(int argc, char* argv[]) { - cxxopts::Options options("TensorRT-LLm DisaggServer Benchmark"); + cxxopts::Options options("TensorRT LLM DisaggServer Benchmark"); options.add_options()("h,help", "Print usage"); options.add_options()("context_engine_dirs", "Directories that store context engines,separator is a ,", cxxopts::value>()); diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp index a586610f15..7bfe10f0df 100644 --- a/benchmarks/cpp/gptManagerBenchmark.cpp +++ b/benchmarks/cpp/gptManagerBenchmark.cpp @@ -1055,7 +1055,7 @@ void benchmarkExecutor(std::optional const& decoderEngine int main(int argc, char* argv[]) { cxxopts::Options options( - "TensorRT-LLM BatchManager Benchmark", "TensorRT-LLM BatchManager Benchmark for GPT and GPT-like models."); + "TensorRT LLM BatchManager Benchmark", "TensorRT LLM BatchManager Benchmark for GPT and GPT-like models."); options.add_options()("h,help", "Print usage"); options.add_options()("engine_dir, decoder_engine_dir", "Directory that store the engines of decoder models.", cxxopts::value()); diff --git a/cpp/include/tensorrt_llm/deep_gemm/compiler.cuh b/cpp/include/tensorrt_llm/deep_gemm/compiler.cuh index 8c00e939d5..8ec9f2ed42 100644 --- a/cpp/include/tensorrt_llm/deep_gemm/compiler.cuh +++ b/cpp/include/tensorrt_llm/deep_gemm/compiler.cuh @@ -217,7 +217,7 @@ std::vector getJitIncludeDirs() } else { - TLLM_LOG_WARNING("Failed to find TensorRT-LLM installation, DeepGEMM will be disabled."); + TLLM_LOG_WARNING("Failed to find TensorRT LLM installation, DeepGEMM will be disabled."); } } return includeDirs; diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index 48ac605a3f..81340ef463 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -165,7 +165,7 @@ 
CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa { void* ret = dllGetSym(handle, name); TLLM_CHECK_WITH_INFO(ret != nullptr, - "Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not " + "Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not " "built with UCX support, please rebuild in UCX-enabled environment."); return ret; }; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h index 34aa05ddc4..b12dbf47f6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h @@ -105,7 +105,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm100(T* D, void const* A, void const* break; default: throw std::runtime_error( - "[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM."); + "[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM."); break; } } @@ -146,15 +146,15 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm100(T* D, void const* A, void const* B, occupancy); break; case tkc::CutlassTileConfigSM100::Undefined: - throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined."); + throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined."); break; case tkc::CutlassTileConfigSM100::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by " + "[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by " "heuristic."); break; default: - throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM."); + throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM."); break; } } @@ -177,7 +177,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm120(T* D, void const* A, void const* break; default: throw std::runtime_error( - "[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM."); + "[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM."); break; } } @@ -205,16 +205,16 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm120(T* D, void const* A, void const* B, occupancy); break; case tkc::CutlassTileConfigSM120::Undefined: - throw std::runtime_error("[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined."); + throw std::runtime_error("[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined."); break; case tkc::CutlassTileConfigSM120::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by " + "[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by " "heuristic."); break; default: throw std::runtime_error( - "[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM."); + "[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM."); break; } } @@ -257,7 +257,7 @@ size_t dispatchMXFP8xMXFP4GemmClusterShapeSm100(T* D, void const* A, void const* break; default: throw std::runtime_error( - "[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] 
Config is invalid for FP4 GEMM."); + "[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM."); break; } } @@ -293,15 +293,15 @@ size_t dispatchMXFP8xMXFP4GemmCTAShapeSm100(T* D, void const* A, void const* B, occupancy); break; case tkc::CutlassTileConfigSM100::Undefined: - throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined."); + throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined."); break; case tkc::CutlassTileConfigSM100::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by " + "[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by " "heuristic."); break; default: - throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM."); + throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM."); break; } } @@ -338,7 +338,7 @@ size_t CutlassFp4GemmRunner::dispatchToArch(T* D, void const* A, else { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM"); + "[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM"); } } else if constexpr (fp4GemmType == FP4GemmType::W4A4_NVFP4_NVFP4) @@ -356,13 +356,13 @@ size_t CutlassFp4GemmRunner::dispatchToArch(T* D, void const* A, else { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM"); + "[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM"); } } else { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM"); + "[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM"); } } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h index 129ff4f1a4..4191b337fe 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h @@ -93,7 +93,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const* int* occupancy) { throw std::runtime_error( - "[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); + "[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); } #else @@ -250,7 +250,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const* { std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " + std::to_string(mMaxSmemSize); - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); } /* // Return workspace size */ if (!A && !B && !D) @@ -261,28 +261,28 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const* { std::string errMsg("Requested workspace size insufficient. 
Required " + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); } auto can_implement = gemm.can_implement(args); if (can_implement != cutlass::Status::kSuccess) { std::string errMsg = "MXFP8xMXFP4 Gemm cutlass kernel will fail for params. Error: " + std::string(cutlassGetStatusString(can_implement)); - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); } auto initStatus = gemm.initialize(args, workspace, stream); if (initStatus != cutlass::Status::kSuccess) { std::string errMsg = "Failed to initialize cutlass MXFP8xMXFP4 gemm. Error: " + std::string(cutlassGetStatusString(initStatus)); - throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg); } auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); if (runStatus != cutlass::Status::kSuccess) { std::string errMsg = "Failed to run cutlass MXFP8xMXFP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); - throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg); } return gemm.get_workspace_size(args); } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h index da7b303351..c6c794f582 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h @@ -107,7 +107,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void int* occupancy) \ { \ throw std::runtime_error( \ - "[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \ + "[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \ } #else @@ -268,7 +268,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void { \ std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \ + std::to_string(mMaxSmemSize); \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ /* // Return workspace size */ \ if (!A && !B && !D) \ @@ -279,28 +279,28 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void { \ std::string errMsg("Requested workspace size insufficient. Required " \ + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ auto can_implement = gemm.can_implement(args); \ if (can_implement != cutlass::Status::kSuccess) \ { \ std::string errMsg = "FP4 Gemm cutlass kernel will fail for params. 
Error: " \ + std::string(cutlassGetStatusString(can_implement)); \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ auto initStatus = gemm.initialize(args, workspace, stream); \ if (initStatus != cutlass::Status::kSuccess) \ { \ std::string errMsg \ = "Failed to initialize cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(initStatus)); \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \ if (runStatus != cutlass::Status::kSuccess) \ { \ std::string errMsg \ = "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ return gemm.get_workspace_size(args); \ } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h index 542357f0bd..d9eeda8476 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h @@ -69,7 +69,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B, int* occupancy) \ { \ throw std::runtime_error( \ - "[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \ + "[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \ } #else @@ -224,7 +224,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B, { \ std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \ + std::to_string(mMaxSmemSize); \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ /* // Return workspace size */ \ if (!A && !B && !D) \ @@ -235,7 +235,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B, { \ std::string errMsg("Requested workspace size insufficient. Required " \ + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ auto initStatus = gemm.initialize(args, workspace); \ if (initStatus != cutlass::Status::kSuccess) \ @@ -243,14 +243,14 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B, auto cudaErrMsg = cudaGetErrorString(cudaGetLastError()); \ std::string errMsg = "Failed to initialize cutlass FP4 gemm. 
Error: " \ + std::string(cutlass::cutlassGetStatusString(initStatus)) + " " + cudaErrMsg; \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \ if (runStatus != cutlass::Status::kSuccess) \ { \ std::string errMsg \ = "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlass::cutlassGetStatusString(runStatus)); \ - throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \ + throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \ } \ return gemm.get_workspace_size(args); \ } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h index 350e5177a1..cda815d294 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h @@ -75,7 +75,7 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg { std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " + std::to_string(mMaxSmemSize); - throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg); } // Return workspace size @@ -88,7 +88,7 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg { std::string errMsg("Requested workspace size insufficient. Required " + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); - throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg); } auto can_implement = gemm.can_implement(args); @@ -96,21 +96,21 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg { std::string errMsg = "fp8RowwiseGemm cutlass kernel not implemented given the params. Error: " + std::string(cutlassGetStatusString(can_implement)); - throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg); } auto initStatus = gemm.initialize(args, workspace, stream); if (initStatus != cutlass::Status::kSuccess) { std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus)); - throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg); } auto runStatus = gemm.run(stream); if (runStatus != cutlass::Status::kSuccess) { std::string errMsg = "Failed to run gemm. 
Error: " + std::string(cutlassGetStatusString(runStatus)); - throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg); } return gemm.get_workspace_size(args); } @@ -210,7 +210,7 @@ size_t dispatchGemmConfigSm89(void* D, void const* A, void const* B, void const* break; default: throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm89] Config is invalid for " + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm89] Config is invalid for " "Fp8 Rowwise GEMM."); break; } @@ -299,16 +299,16 @@ size_t dispatchGemmToCutlassSm89(void* D, void const* A, void const* B, void con case tkc::CutlassTileConfig::Undefined: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config undefined."); + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config undefined."); break; case tkc::CutlassTileConfig::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config should have " + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config should have " "already been set by heuristic."); break; default: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] Config is invalid for " + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] Config is invalid for " "Fp8 Rowwise GEMM."); break; } @@ -379,7 +379,7 @@ size_t genericFp8RowwiseGemmKernelLauncherSm90(void* D, void const* A, void cons Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); #else // COMPILE_HOPPER_TMA_GEMMS throw std::runtime_error( - "[TensorRT-LLm Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing " + "[TensorRT LLM Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing " "90-real as an arch to build_wheel.py."); #endif // COMPILE_HOPPER_TMA_GEMMS } @@ -418,7 +418,7 @@ size_t dispatchGemmConfigSm90(void* D, void const* A, void const* B, void const* break; default: throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm90] Config is invalid for " + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm90] Config is invalid for " "Fp8 Rowwise GEMM."); break; } @@ -468,16 +468,16 @@ size_t dispatchGemmToCutlassSm90(void* D, void const* A, void const* B, void con break; case tkc::CutlassTileConfigSM90::Undefined: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined."); + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined."); break; case tkc::CutlassTileConfigSM90::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config should have " + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config should have " "already been set by heuristic."); break; default: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for " + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for " "Fp8 Rowwise GEMM."); 
break; } @@ -517,7 +517,7 @@ size_t CutlassFp8RowwiseGemmRunner::dispatchToArch(void* D, void const* A, vo #endif { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS " + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS " "Fp8 Rowwise GEMM"); } return 0; @@ -585,7 +585,7 @@ std::vector CutlassFp8RowwiseGemmRunner::getConfigs() else { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS " + "[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS " "Fp8 Rowwise GEMM"); } return candidateConfigs; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index 07ea2923fb..37b0593fbf 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -209,7 +209,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const { std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: " + std::string(cutlassGetStatusString(can_implement)); - throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); } auto init_status = gemm.initialize(args, workspace, stream); @@ -217,7 +217,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const { std::string err_msg = "Failed to initialize cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(init_status)); - throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); } auto run_status = gemm.run(stream); @@ -225,7 +225,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const { std::string err_msg = "Failed to run cutlass fpA_intB gemm. 
Error: " + std::string(cutlassGetStatusString(run_status)); - throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); } } @@ -247,14 +247,14 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B, Sca // Multistage only supported on Ampere std::string err_msg = "Cutlass fpA_intB gemm not supported for arch " + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages); - throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg); } else if constexpr (Stages == 2 && arch::kMinComputeCapability >= 89) { // Multistage only supported on Ampere std::string err_msg = "Cutlass fpA_intB gemm not supported for arch " + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages); - throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg); } else if constexpr (cutlass::platform::is_same::value && arch::kMinComputeCapability < 89) @@ -262,7 +262,7 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B, Sca // FP8 activation type only supported on Ada+ GPUs std::string err_msg = "Cutlass fpA_intB gemm not supported for arch " + std::to_string(arch::kMinComputeCapability) + " with activation type set to FP8"; - throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg); } else { @@ -301,7 +301,7 @@ void dispatch_gemm_config(ActivationType const* A, WeightType const* B, ScaleZer break; default: std::string err_msg = "dispatch_gemm_config does not support stages " + std::to_string(gemm_config.stages); - throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_config] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_config] " + err_msg); break; } } @@ -370,16 +370,16 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, Scal C, m, n, k, group_size, gemm_config, workspace, workspace_bytes, stream, occupancy); break; case tkc::CutlassTileConfig::Undefined: - throw std::runtime_error("[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined."); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined."); break; case tkc::CutlassTileConfig::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have already been set by " + "[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have already been set by " "heuristic."); break; default: throw std::runtime_error( - "[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM."); + "[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM."); break; } } @@ -387,7 +387,7 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, Scal { // This is not a limitation in CUTLASS. We just do not need to support this case. 
std::string err_msg = "The activation type must equal the scale, bias and output types on Ampere and earlier."; - throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_to_cutlass] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_to_cutlass] " + err_msg); } } @@ -439,7 +439,7 @@ void CutlassFpAIntBGemmRunner::value) { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada needs " + "[TensorRT LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada needs " "CUDA>=12.4"); } #endif @@ -459,7 +459,7 @@ void CutlassFpAIntBGemmRunner workspace_bytes) { - TLLM_LOG_ERROR("[TensorRT-LLm Error][fpA_intB Runner] given workspace size insufficient."); + TLLM_LOG_ERROR("[TensorRT LLM Error][fpA_intB Runner] given workspace size insufficient."); } auto can_implement = gemm.can_implement(args); @@ -258,7 +258,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: " + std::string(cutlassGetStatusString(can_implement)); std::cout << err_msg << std::endl; - throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); } auto init_status = gemm.initialize(args, workspace, stream); @@ -266,7 +266,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType { std::string err_msg = "Failed to initialize cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(init_status)); - throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); } auto run_status = gemm.run(stream); @@ -274,13 +274,13 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType { std::string err_msg = "Failed to run cutlass fpA_intB gemm. 
Error: " + std::string(cutlassGetStatusString(run_status)); - throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); } } else { std::stringstream ss; - ss << "[TensorRT-LLm Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << "," + ss << "[TensorRT LLM Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << "," << (int64_t) cute::size<1>(CTAShape{}) << "," << (int64_t) cute::size<2>(CTAShape{}) << ") (" << (int64_t) cute::size<0>(ClusterShape{}) << "," << (int64_t) cute::size<1>(ClusterShape{}) << "," << (int64_t) cute::size<2>(ClusterShape{}) << ") not compiled with FAST_BUILD."; @@ -290,7 +290,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType #else // COMPILE_HOPPER_TMA_GEMMS throw std::runtime_error( - "[TensorRT-LLm Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an arch " + "[TensorRT LLM Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an arch " "to build_wheel.py."); #endif // COMPILE_HOPPER_TMA_GEMMS } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h index d4fee178e0..ce175160a9 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h @@ -67,7 +67,7 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo { std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " + std::to_string(mMaxSmemSize); - throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg); } // Return workspace size @@ -80,7 +80,7 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo { std::string errMsg("Requested workspace size insufficient. Required " + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); - throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg); } auto can_implement = gemm.can_implement(args); @@ -88,21 +88,21 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo { std::string errMsg = "fusedGatedGemm cutlass kernel not implemented given the params. Error: " + std::string(cutlassGetStatusString(can_implement)); - throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg); } auto initStatus = gemm.initialize(args, workspace, stream); if (initStatus != cutlass::Status::kSuccess) { std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus)); - throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg); } auto runStatus = gemm.run(stream); if (runStatus != cutlass::Status::kSuccess) { std::string errMsg = "Failed to run gemm. 
Error: " + std::string(cutlassGetStatusString(runStatus)); - throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg); } return gemm.get_workspace_size(args); } @@ -165,7 +165,7 @@ size_t genericGemmGatedKernelLauncherSm90(void* D, void const* A, void const* B, return typedGemmGatedKernelLauncher(Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy); #else // COMPILE_HOPPER_TMA_GEMMS throw std::runtime_error( - "[TensorRT-LLm Error][GemmGatedKernelLauncherSm90] Please recompile with support for hopper by passing 90-real " + "[TensorRT LLM Error][GemmGatedKernelLauncherSm90] Please recompile with support for hopper by passing 90-real " "as an arch to build_wheel.py."); #endif // COMPILE_HOPPER_TMA_GEMMS } @@ -204,7 +204,7 @@ size_t dispatchGemmConfigSm90(void* D, void const* A, void const* B, void const* break; default: throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmConfigSm90] Config is invalid for fused " + "[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmConfigSm90] Config is invalid for fused " "gated GEMM."); break; } @@ -255,17 +255,17 @@ size_t dispatchGemmToCutlassSm90(void* D, void const* A, void const* B, void con break; case tkc::CutlassTileConfigSM90::Undefined: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined."); + "[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined."); break; case tkc::CutlassTileConfigSM90::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config should have " + "[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config should have " "already been set by " "heuristic."); break; default: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for fused " + "[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for fused " "gated GEMM."); break; } @@ -302,14 +302,14 @@ size_t CutlassFusedGatedGemmRunner::dispatchToArch(void* D, void const* A, vo #endif { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused " + "[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused " "gated GEMM"); } } else { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused " + "[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused " "gated " "GEMM"); } @@ -340,7 +340,7 @@ std::vector CutlassFusedGatedGemmRunner::getConfigs() if (mSm != 90) { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused " + "[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused " "gated GEMM"); } tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param @@ -378,7 +378,7 @@ std::vector CutlassFusedGatedGemmRunner::getConfigs() else { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused " + "[TensorRT LLM 
Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused " "gated " "GEMM"); } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h index c44caae0fa..4d4916d563 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h @@ -150,7 +150,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo { std::string errMsg = "int8gemm cutlass kernel will fail for params. Error: " + std::string(cutlassGetStatusString(can_implement)); - throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg); } auto initStatus = gemm.initialize(args, workspace, stream); @@ -158,7 +158,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo { std::string errMsg = "Failed to initialize cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(initStatus)); - throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg); } auto runStatus = gemm.run(stream); @@ -166,7 +166,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo { std::string errMsg = "Failed to run cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); - throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg); } } @@ -180,7 +180,7 @@ struct dispatchStages TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); std::string errMsg = "Cutlass int8 gemm. 
Not instantiates for arch " + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages); - throw std::runtime_error("[TensorRT-LLM Error][dispatchStages::dispatch] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][dispatchStages::dispatch] " + errMsg); } }; @@ -248,7 +248,7 @@ void dispatchGemmConfig(int8_t const* A, int8_t const* B, tk::QuantMode quantOpt break; default: std::string errMsg = "dispatchGemmConfig does not support stages " + std::to_string(gemmConfig.stages); - throw std::runtime_error("[TensorRT-LLM Error][dispatch_gemm_config] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_config] " + errMsg); break; } } @@ -288,16 +288,16 @@ void dispatchGemmToCutlass(int8_t const* A, int8_t const* B, tk::QuantMode quant quantOption, alphaCol, alphaRow, C, m, n, k, gemmConfig, workspace, workspaceBytes, stream, occupancy); break; case tkc::CutlassTileConfig::Undefined: - throw std::runtime_error("[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] gemm config undefined."); + throw std::runtime_error("[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] gemm config undefined."); break; case tkc::CutlassTileConfig::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] gemm config should have already been set by " + "[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] gemm config should have already been set by " "heuristic."); break; default: throw std::runtime_error( - "[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] Config is invalid for int8 GEMM."); + "[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] Config is invalid for int8 GEMM."); break; } } @@ -342,7 +342,7 @@ void CutlassInt8GemmRunner::dispatchToArch(int8_t const* A, int8_t const* B, else { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM"); + "[TensorRT LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM"); } } @@ -364,7 +364,7 @@ std::vector CutlassInt8GemmRunner::getConfigs() const if (mSm <= 70) { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM"); + "[TensorRT LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM"); } std::vector candidateConfigs = get_candidate_configs(mSm, SPLIT_K_LIMIT, config_type_param); diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h index 42a6beaf9d..2395650223 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h @@ -195,7 +195,7 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f { std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " + std::to_string(mMaxSmemSize); - throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); } // Return workspace size @@ -208,7 +208,7 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f { std::string errMsg("Requested workspace size insufficient. 
Required " + std::to_string(gemm.get_workspace_size(arguments)) + ", got " + std::to_string(workspaceBytes)); - throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); } auto can_implement = gemm.can_implement(arguments); @@ -216,26 +216,26 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f { std::string errMsg = "Fp8LowLatencyGemm cutlass kernel not implemented given the params. Error: " + std::string(cutlassGetStatusString(can_implement)); - throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); } auto initStatus = gemm.initialize(arguments, workspacePtr); if (initStatus != cutlass::Status::kSuccess) { std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus)); - throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); } auto runStatus = gemm.run(stream, nullptr, pdl_overlap_ratio >= 0); if (runStatus != cutlass::Status::kSuccess) { std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); - throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); + throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg); } return gemm.get_workspace_size(arguments); #else // COMPILE_HOPPER_TMA_GEMMS throw std::runtime_error( - "[TensorRT-LLm Error][genericFp8LowLatencyGemmKernelLauncherSm90] Please recompile with support for hopper by " + "[TensorRT LLM Error][genericFp8LowLatencyGemmKernelLauncherSm90] Please recompile with support for hopper by " "passing 90-real as an arch to build_wheel.py."); #endif // COMPILE_HOPPER_TMA_GEMMS } @@ -264,7 +264,7 @@ size_t dispatchLowLatencyGemmCultassKernelSchedSm90(__nv_fp8_e4m3 const* A, __nv break; default: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmCultassKernelSchedSm90] Config " + "[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmCultassKernelSchedSm90] Config " "is " "invalid for low latency fp8 gemm"); break; @@ -300,7 +300,7 @@ size_t dispatchLowLatencyGemmClusterShapeSm90(__nv_fp8_e4m3 const* A, __nv_fp8_e default: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmClusterShapeSm90] Config is " + "[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmClusterShapeSm90] Config is " "invalid for low latency fp8 gemm"); break; } @@ -369,19 +369,19 @@ size_t dispatchLowLatencyGemmToCutlassSm90(__nv_fp8_e4m3 const* A, __nv_fp8_e4m3 break; case tkc::CutlassTileConfigSM90::Undefined: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config " + "[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config " "undefined."); break; case tkc::CutlassTileConfigSM90::ChooseWithHeuristic: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config " + "[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config " "should have " "already been set by " "heuristic."); 
break; default: throw std::runtime_error( - "[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] Config is " + "[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] Config is " "invalid for low latency fp8 gemm"); break; } @@ -413,7 +413,7 @@ size_t CutlassLowLatencyFp8GemmRunner::dispatchToArch(__nv_fp8_e4m3 const* A, { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS Low " + "[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS Low " "Latency Gemm"); } return 0; @@ -499,7 +499,7 @@ std::vector CutlassLowLatencyFp8GemmRunner::getConfigs() const if (mSm != 90) { throw std::runtime_error( - "[TensorRT-LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP8 Low " + "[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP8 Low " "Latency GEMM"); } tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index 53bd7bc33c..2720bf4232 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -235,12 +235,12 @@ struct BatchedGemmData void const* mPtrBias{nullptr}; // The output tensor scaling factor for MxFp{4,8}, Fp8 and NvFp4 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [B]. float const* mPtrScaleC{nullptr}; // The output gate scale for MxFp{4,8} and NvFp4 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [B]. float const* mPtrScaleGate{nullptr}; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h index 0ebe9a94c8..56ea4561c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h @@ -214,12 +214,12 @@ struct KernelParams // ScaleC = SEncC // // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [B]. One scaling factor per tensor in batch. float const* ptrScaleC{nullptr}; // The output gate scale for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [B]. One scaling factor per tensor in batch. 
float const* ptrScaleGate{nullptr}; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h index 459d831e0b..f17d691002 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h @@ -143,7 +143,7 @@ struct GemmData void const* mPtrPerTokenSfB{nullptr}; // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [1]. void* mPtrScaleC{nullptr}; }; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h index 142e9728dc..0e0a02d16b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h @@ -204,7 +204,7 @@ struct KernelParams void* ptrSfC; // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [1]. float const* ptrScaleC; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h index a8087dc59a..a3f83cead9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h @@ -133,11 +133,11 @@ struct GemmGatedActData void const* mPtrPerTokenSfB{nullptr}; // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [1]. void const* mPtrScaleC{nullptr}; // The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [1]. void const* mPtrScaleGate{nullptr}; }; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h index 4a7bde2a17..b9bdd2c3a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h @@ -290,7 +290,7 @@ struct KernelParams // y = act(ptrScaleGate[0] * y1) * (ptrScaleC[0] * y2) // // The output tensor scaling factor for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. + // TensorRT LLM API requires a scaling factor on the device. // Shape is [1]. float const* ptrScaleC; // The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization. 
diff --git a/cpp/tensorrt_llm/nanobind/bindings.cpp b/cpp/tensorrt_llm/nanobind/bindings.cpp index 89cfa72211..7961dac599 100644 --- a/cpp/tensorrt_llm/nanobind/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/bindings.cpp @@ -73,7 +73,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector const& con NB_MODULE(TRTLLM_NB_MODULE, m) { - m.doc() = "TensorRT-LLM Python bindings for C++ runtime"; + m.doc() = "TensorRT LLM Python bindings for C++ runtime"; m.attr("binding_type") = "nanobind"; nb::set_leak_warnings(false); diff --git a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp index e2fab9044c..4e6b29be99 100644 --- a/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/bertAttentionPlugin/bertAttentionPlugin.cpp @@ -125,7 +125,7 @@ BertAttentionPlugin::BertAttentionPlugin(void const* data, size_t length) TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/cudaStreamPlugin/cudaStreamPlugin.cpp b/cpp/tensorrt_llm/plugins/cudaStreamPlugin/cudaStreamPlugin.cpp index bf3bdc0297..802e828c92 100644 --- a/cpp/tensorrt_llm/plugins/cudaStreamPlugin/cudaStreamPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/cudaStreamPlugin/cudaStreamPlugin.cpp @@ -48,7 +48,7 @@ CudaStreamPlugin::CudaStreamPlugin(void const* data, size_t length) TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.cpp b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.cpp index 9aa660dda5..899c93855b 100644 --- a/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleDecodeDraftTokensPlugin.cpp @@ -58,7 +58,7 @@ EagleDecodeDraftTokensPlugin::EagleDecodeDraftTokensPlugin(void const* data, siz read(d, mTopKSampling); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", static_cast(length), static_cast(d - a)); } diff --git a/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.cpp b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.cpp index a2ebf2ee55..5fb30f5837 100644 --- a/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/eaglePlugin/eagleSampleAndAcceptDraftTokensPlugin.cpp @@ -52,7 +52,7 @@ EagleSampleAndAcceptDraftTokensPlugin::EagleSampleAndAcceptDraftTokensPlugin(voi read(d, mDtype); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). 
This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/fusedLayernormPlugin/fusedLayernormPlugin.cpp b/cpp/tensorrt_llm/plugins/fusedLayernormPlugin/fusedLayernormPlugin.cpp index 030895123a..541afdadc4 100644 --- a/cpp/tensorrt_llm/plugins/fusedLayernormPlugin/fusedLayernormPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/fusedLayernormPlugin/fusedLayernormPlugin.cpp @@ -47,7 +47,7 @@ FusedLayernormPlugin::FusedLayernormPlugin(void const* data, size_t length) read(d, mType); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp b/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp index 4cec38b046..08ee2af554 100644 --- a/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp @@ -203,7 +203,7 @@ static GemmAllReducePluginOptions deserializeOptions(void const*& data, size_t l TLLM_CHECK_WITH_INFO(end == begin + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (end - begin)); diff --git a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp index ac20e2cfd3..9e06ad01d1 100644 --- a/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/gemmPlugin/gemmPlugin.cpp @@ -179,7 +179,7 @@ GemmPlugin::GemmPlugin(void const* data, size_t length, GemmPlugin::PluginProfil TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp index 98e59c8fdd..fa160070e4 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp @@ -183,7 +183,7 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const* data, size_t leng } TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). 
This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); TLLM_CHECK_WITH_INFO((smVersion() >= 80) || (mType != nvinfer1::DataType::kBF16), diff --git a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp index 2174fd5332..109010e7a9 100644 --- a/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/identityPlugin/identityPlugin.cpp @@ -35,7 +35,7 @@ IdentityPlugin::IdentityPlugin(void const* data, size_t length) char const *d = reinterpret_cast(data), *a = d; TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp index f397044ad6..02a40a00c9 100644 --- a/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/layernormQuantizationPlugin/layernormQuantizationPlugin.cpp @@ -61,7 +61,7 @@ LayernormQuantizationPlugin::LayernormQuantizationPlugin(void const* data, size_ read(d, mOutputType); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp index 3016c3c945..e4d26f9e5e 100644 --- a/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/lookupPlugin/lookupPlugin.cpp @@ -48,7 +48,7 @@ LookupPlugin::LookupPlugin(void const* data, size_t length) read(d, mRank); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp b/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp index 763ca2f069..7a7d925a74 100644 --- a/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp @@ -78,7 +78,7 @@ LoraPlugin::LoraPlugin(void const* data, size_t length, LoraPlugin::PluginProfil TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). 
This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/lowLatencyGemmPlugin/lowLatencyGemmPlugin.cpp b/cpp/tensorrt_llm/plugins/lowLatencyGemmPlugin/lowLatencyGemmPlugin.cpp index fd412bc0d4..6165d6210f 100644 --- a/cpp/tensorrt_llm/plugins/lowLatencyGemmPlugin/lowLatencyGemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/lowLatencyGemmPlugin/lowLatencyGemmPlugin.cpp @@ -124,7 +124,7 @@ LowLatencyGemmPlugin::LowLatencyGemmPlugin(void const* data, size_t length, Plug mPluginProfiler->deserialize(d, mDims, mGemmId); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/lowLatencyGemmSwigluPlugin/lowLatencyGemmSwigluPlugin.cpp b/cpp/tensorrt_llm/plugins/lowLatencyGemmSwigluPlugin/lowLatencyGemmSwigluPlugin.cpp index d2e8d370ec..a1aa11c2f1 100644 --- a/cpp/tensorrt_llm/plugins/lowLatencyGemmSwigluPlugin/lowLatencyGemmSwigluPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/lowLatencyGemmSwigluPlugin/lowLatencyGemmSwigluPlugin.cpp @@ -159,7 +159,7 @@ LowLatencyGemmSwigluPlugin::LowLatencyGemmSwigluPlugin( mPluginProfiler->deserialize(d, mDims, mGemmId); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp index 6db0e4a382..48b7c23b82 100644 --- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp @@ -175,7 +175,7 @@ MixtureOfExpertsPlugin::MixtureOfExpertsPlugin(void const* data, size_t length, TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp index 5c8a35b808..4825dd51bb 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allgatherPlugin.cpp @@ -48,7 +48,7 @@ AllgatherPlugin::AllgatherPlugin(void const* data, size_t length) } TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). 
This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp index 89e05fb61a..4241cf8d85 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp @@ -77,7 +77,7 @@ AllreducePlugin::AllreducePlugin(void const* data, size_t length) } TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); check(); diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp index b6c3ba87c1..089ed31175 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/recvPlugin.cpp @@ -45,7 +45,7 @@ RecvPlugin::RecvPlugin(void const* data, size_t length) read(d, mSrcRank); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.cpp index 09263d1498..fe17c44fc4 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/reduceScatterPlugin.cpp @@ -48,7 +48,7 @@ ReduceScatterPlugin::ReduceScatterPlugin(void const* data, size_t length) } TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp index b1ca7165a5..81d66aa821 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/sendPlugin.cpp @@ -46,7 +46,7 @@ SendPlugin::SendPlugin(void const* data, size_t length) read(d, mTgtRank); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/qserveGemmPlugin/qserveGemmPlugin.cpp b/cpp/tensorrt_llm/plugins/qserveGemmPlugin/qserveGemmPlugin.cpp index 1ba984cbed..166f1cc32c 100644 --- a/cpp/tensorrt_llm/plugins/qserveGemmPlugin/qserveGemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/qserveGemmPlugin/qserveGemmPlugin.cpp @@ -64,7 +64,7 @@ QServeGemmPlugin::QServeGemmPlugin(void const* data, size_t length) TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). 
This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp index 678c6eaa1a..23d0b80390 100644 --- a/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/quantizePerTokenPlugin/quantizePerTokenPlugin.cpp @@ -51,7 +51,7 @@ QuantizePerTokenPlugin::QuantizePerTokenPlugin(void const* data, size_t length) read(d, mSumPerToken); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp index 7d4bbbb65d..cacb32b809 100644 --- a/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp @@ -35,7 +35,7 @@ QuantizeTensorPlugin::QuantizeTensorPlugin(void const* data, size_t length) char const *d = reinterpret_cast(data), *a = d; TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/quantizeToFP4Plugin/quantizeToFP4Plugin.cpp b/cpp/tensorrt_llm/plugins/quantizeToFP4Plugin/quantizeToFP4Plugin.cpp index b75e7cb066..7f88ca2e85 100644 --- a/cpp/tensorrt_llm/plugins/quantizeToFP4Plugin/quantizeToFP4Plugin.cpp +++ b/cpp/tensorrt_llm/plugins/quantizeToFP4Plugin/quantizeToFP4Plugin.cpp @@ -41,7 +41,7 @@ QuantizeToFP4Plugin::QuantizeToFP4Plugin(void const* data, size_t length) char const *d = reinterpret_cast(data), *a = d; TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp index d8e2fbe595..16d0bf2dc3 100644 --- a/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/rmsnormQuantizationPlugin/rmsnormQuantizationPlugin.cpp @@ -58,7 +58,7 @@ RmsnormQuantizationPlugin::RmsnormQuantizationPlugin(void const* data, size_t le read(d, mOutputType); TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). 
This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp index 4235c808a4..9470a879a0 100644 --- a/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/smoothQuantGemmPlugin/smoothQuantGemmPlugin.cpp @@ -98,7 +98,7 @@ SmoothQuantGemmPlugin::SmoothQuantGemmPlugin( TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp index c9b779f4f3..85f0cf0112 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/weightOnlyGroupwiseQuantMatmulPlugin/weightOnlyGroupwiseQuantMatmulPlugin.cpp @@ -148,7 +148,7 @@ WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin( TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp index 115f8c2a19..f3ed07fafa 100644 --- a/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/weightOnlyQuantMatmulPlugin/weightOnlyQuantMatmulPlugin.cpp @@ -126,7 +126,7 @@ WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin( TLLM_CHECK_WITH_INFO(d == a + length, "Expected length (%d) != real length (%d). 
This is often " - "caused by using different TensorRT-LLM version to build " + "caused by using different TensorRT LLM version to build " "engine and run engine.", (int) length, (int) (d - a)); } diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp index 216baaa362..07c4943e79 100644 --- a/cpp/tensorrt_llm/pybind/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/bindings.cpp @@ -67,7 +67,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector const& con PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) { - m.doc() = "TensorRT-LLM Python bindings for C++ runtime"; + m.doc() = "TensorRT LLM Python bindings for C++ runtime"; m.attr("binding_type") = "pybind"; // Create MpiComm binding first since it's used in the executor bindings diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.h b/cpp/tensorrt_llm/runtime/tllmRuntime.h index d254907267..dfef06d8b4 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.h +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.h @@ -56,7 +56,7 @@ public: } /// @brief If multiple TensorRT optimization profiles are built in the engine, this function selects the - /// corresponding profile that is going to be used based on the runtime shape, for now, TensorRT-LLM only split + /// corresponding profile that is going to be used based on the runtime shape, for now, TensorRT LLM only split /// multiple profiles on the num_tokens dimension, hence the profile index is selected based on which profile /// handles the actual num_tokens /// @return The index of the selected TensorRT optimization profile diff --git a/cpp/tests/batch_manager/cacheTransceiverTest.cpp b/cpp/tests/batch_manager/cacheTransceiverTest.cpp index af916359d0..f6a7f2a139 100644 --- a/cpp/tests/batch_manager/cacheTransceiverTest.cpp +++ b/cpp/tests/batch_manager/cacheTransceiverTest.cpp @@ -330,7 +330,7 @@ protected: { void* ret = dllGetSym(handle, name); TLLM_CHECK_WITH_INFO(ret != nullptr, - "Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not " + "Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not " "built with UCX support, please rebuild in UCX-enabled environment."); return ret; }; @@ -732,7 +732,7 @@ protected: { void* ret = dllGetSym(handle, name); TLLM_CHECK_WITH_INFO(ret != nullptr, - "Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not " + "Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not " "built with UCX support, please rebuild in UCX-enabled environment."); return ret; }; diff --git a/cpp/tests/unit_tests/executor/ucxCommTest.cpp b/cpp/tests/unit_tests/executor/ucxCommTest.cpp index 5895ac0947..08b5c0f7fa 100644 --- a/cpp/tests/unit_tests/executor/ucxCommTest.cpp +++ b/cpp/tests/unit_tests/executor/ucxCommTest.cpp @@ -70,7 +70,7 @@ std::unique_ptr makeOneUcxConnectionManager( void* ret = dllGetSym(handle, name); TLLM_CHECK_WITH_INFO(ret != nullptr, - "Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not " + "Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not " "built with UCX support, please rebuild in UCX-enabled environment."); return ret; }; diff --git a/cpp/tests/unit_tests/kernels/fused_gated_gemm/gemmSwigluKernelTestSm90Fp8.cu b/cpp/tests/unit_tests/kernels/fused_gated_gemm/gemmSwigluKernelTestSm90Fp8.cu index 5aa2f0a518..bdbec84ec8 100644 --- a/cpp/tests/unit_tests/kernels/fused_gated_gemm/gemmSwigluKernelTestSm90Fp8.cu +++ 
b/cpp/tests/unit_tests/kernels/fused_gated_gemm/gemmSwigluKernelTestSm90Fp8.cu @@ -243,7 +243,7 @@ Result run(std::string description, Options& options, Buffers& buffers) auto can_implement = device_gemm.can_implement(arguments); if (can_implement != cutlass::Status::kSuccess) { - throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner]"); + throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner]"); } // Initialize CUTLASS kernel with arguments and workspace pointer @@ -481,7 +481,7 @@ int main(int argc, char const** argv) #ifdef COMPILE_HOPPER_TMA_GEMMS Result hopperFp8 = run(std::string("Hopper fp8 swiglu"), options, buffers); #else // COMPILE_HOPPER_TMA_GEMMS - std::cout << "[TensorRT-LLm Error][GemmSwigluKernelTestSm90Fp8] Please recompile with support for hopper by " + std::cout << "[TensorRT LLM Error][GemmSwigluKernelTestSm90Fp8] Please recompile with support for hopper by " "passing 90-real as an arch to build_wheel.py." << std::endl; #endif // COMPILE_HOPPER_TMA_GEMMS diff --git a/cpp/tests/unit_tests/kernels/fused_gated_gemm/gemmSwigluRunnerTest.cu b/cpp/tests/unit_tests/kernels/fused_gated_gemm/gemmSwigluRunnerTest.cu index 3db0d1a4cc..872a7deeee 100644 --- a/cpp/tests/unit_tests/kernels/fused_gated_gemm/gemmSwigluRunnerTest.cu +++ b/cpp/tests/unit_tests/kernels/fused_gated_gemm/gemmSwigluRunnerTest.cu @@ -338,7 +338,7 @@ TEST(GemmSwigluRunner, Sm90FP8) Result hopperFp8 = run("SM90 FP8 WS GEMM", options, buffers); EXPECT_TRUE(hopperFp8.passed); #else // COMPILE_HOPPER_TMA_GEMMS - std::cout << "[TensorRT-LLm Error][GemmSwigluRunnerTest] Please recompile with support for hopper by passing " + std::cout << "[TensorRT LLM Error][GemmSwigluRunnerTest] Please recompile with support for hopper by passing " "90-real as an arch to build_wheel.py." << std::endl; #endif // COMPILE_HOPPER_TMA_GEMMS diff --git a/examples/apps/fastapi_server.py b/examples/apps/fastapi_server.py index b2aa0baf2a..510b281a70 100755 --- a/examples/apps/fastapi_server.py +++ b/examples/apps/fastapi_server.py @@ -1,6 +1,6 @@ """ NOTE: This FastAPI-based server is only an example for demonstrating the usage -of TensorRT-LLM LLM API. It is not intended for production use. +of TensorRT LLM LLM API. It is not intended for production use. For production, use the `trtllm-serve` command. The server exposes OpenAI compatible API endpoints. 
""" diff --git a/examples/cpp_library/main.cpp b/examples/cpp_library/main.cpp index 20372f2270..7613a75a14 100644 --- a/examples/cpp_library/main.cpp +++ b/examples/cpp_library/main.cpp @@ -28,11 +28,11 @@ int main(int argc, char* argv[]) void log(nvinfer1::ILogger::Severity severity, char const* msg) noexcept override { if (severity <= nvinfer1::ILogger::Severity::kERROR) - std::cerr << "[TensorRT-LLM ERR]: " << msg << std::endl; + std::cerr << "[TensorRT LLM ERR]: " << msg << std::endl; else if (severity == nvinfer1::ILogger::Severity::kWARNING) - std::cerr << "[TensorRT-LLM WARNING]: " << msg << std::endl; + std::cerr << "[TensorRT LLM WARNING]: " << msg << std::endl; else - std::cout << "[TensorRT-LLM LOG]: " << msg << std::endl; + std::cout << "[TensorRT LLM LOG]: " << msg << std::endl; } }; diff --git a/examples/eagle/convert_checkpoint.py b/examples/eagle/convert_checkpoint.py index 1632e1e218..4b6e2d0e12 100644 --- a/examples/eagle/convert_checkpoint.py +++ b/examples/eagle/convert_checkpoint.py @@ -144,7 +144,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/generate_checkpoint_config.py b/examples/generate_checkpoint_config.py index 91252e4cfa..a11104eeba 100644 --- a/examples/generate_checkpoint_config.py +++ b/examples/generate_checkpoint_config.py @@ -12,7 +12,7 @@ def parse_arguments(): '--output_path', type=str, default='config.json', - help='The path to save the TensorRT-LLM checkpoint config.json file') + help='The path to save the TensorRT LLM checkpoint config.json file') parser.add_argument('--architecture', type=str, default='GPTForCausalLM') parser.add_argument('--dtype', type=str, diff --git a/examples/llm-api/llm_mgmn_llm_distributed.sh b/examples/llm-api/llm_mgmn_llm_distributed.sh index cd89de7b5d..bc6b6e16a6 100644 --- a/examples/llm-api/llm_mgmn_llm_distributed.sh +++ b/examples/llm-api/llm_mgmn_llm_distributed.sh @@ -29,7 +29,7 @@ # MOUNT_DIR: the directory to mount in the container # MOUNT_DEST: the destination directory in the container # WORKDIR: the working directory in the container -# SOURCE_ROOT: the path to the TensorRT-LLM source +# SOURCE_ROOT: the path to the TensorRT LLM source # PROLOGUE: the prologue to run before the script # LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is # not supported in Slurm mode, you need to download the model and put it in diff --git a/examples/llm-api/llm_mgmn_trtllm_bench.sh b/examples/llm-api/llm_mgmn_trtllm_bench.sh index 5169c00ad3..43c126368d 100644 --- a/examples/llm-api/llm_mgmn_trtllm_bench.sh +++ b/examples/llm-api/llm_mgmn_trtllm_bench.sh @@ -29,7 +29,7 @@ # MOUNT_DIR: the directory to mount in the container # MOUNT_DEST: the destination directory in the container # WORKDIR: the working directory in the container -# SOURCE_ROOT: the path to the TensorRT-LLM source +# SOURCE_ROOT: the path to the TensorRT LLM source # PROLOGUE: the prologue to run before the script # LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is # not supported in Slurm mode, you need to download the model and put it in diff --git a/examples/llm-api/llm_mgmn_trtllm_serve.sh b/examples/llm-api/llm_mgmn_trtllm_serve.sh index 05d9df88ce..a0cd8ce11f 100644 --- a/examples/llm-api/llm_mgmn_trtllm_serve.sh +++ b/examples/llm-api/llm_mgmn_trtllm_serve.sh @@ -29,7 +29,7 @@ 
# MOUNT_DIR: the directory to mount in the container # MOUNT_DEST: the destination directory in the container # WORKDIR: the working directory in the container -# SOURCE_ROOT: the path to the TensorRT-LLM source +# SOURCE_ROOT: the path to the TensorRT LLM source # PROLOGUE: the prologue to run before the script # LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is # not supported in Slurm mode, you need to download the model and put it in diff --git a/examples/llm-api/llm_sampling.py b/examples/llm-api/llm_sampling.py index 679ef17fc9..dcaeb552b7 100644 --- a/examples/llm-api/llm_sampling.py +++ b/examples/llm-api/llm_sampling.py @@ -161,7 +161,7 @@ def demonstrate_with_logprobs(prompt: str): def run_all_demonstrations(model_path: Optional[str] = None): """Run all sampling demonstrations.""" - print("🚀 TensorRT-LLM Sampling Techniques Showcase") + print("🚀 TensorRT LLM Sampling Techniques Showcase") print("=" * 50) # Use the first prompt for most demonstrations diff --git a/examples/medusa/convert_checkpoint.py b/examples/medusa/convert_checkpoint.py index fac23ddfd7..0a15e842bf 100644 --- a/examples/medusa/convert_checkpoint.py +++ b/examples/medusa/convert_checkpoint.py @@ -161,7 +161,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/baichuan/convert_checkpoint.py b/examples/models/contrib/baichuan/convert_checkpoint.py index 840394a93d..2a35bfbe48 100644 --- a/examples/models/contrib/baichuan/convert_checkpoint.py +++ b/examples/models/contrib/baichuan/convert_checkpoint.py @@ -53,7 +53,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/bloom/convert_checkpoint.py b/examples/models/contrib/bloom/convert_checkpoint.py index 7a562e0b4b..24549fa40b 100644 --- a/examples/models/contrib/bloom/convert_checkpoint.py +++ b/examples/models/contrib/bloom/convert_checkpoint.py @@ -156,7 +156,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=Path, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--calib_dataset', type=str, diff --git a/examples/models/contrib/cogvlm/convert_checkpoint.py b/examples/models/contrib/cogvlm/convert_checkpoint.py index 1573e54814..da99d8f006 100644 --- a/examples/models/contrib/cogvlm/convert_checkpoint.py +++ b/examples/models/contrib/cogvlm/convert_checkpoint.py @@ -190,7 +190,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/dbrx/convert_checkpoint.py b/examples/models/contrib/dbrx/convert_checkpoint.py index cc463c76e4..f9f8ac9cea 100644 --- a/examples/models/contrib/dbrx/convert_checkpoint.py +++ b/examples/models/contrib/dbrx/convert_checkpoint.py @@ -90,7 +90,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM 
checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/deepseek_v1/convert_checkpoint.py b/examples/models/contrib/deepseek_v1/convert_checkpoint.py index 024eae4e22..50751f4260 100644 --- a/examples/models/contrib/deepseek_v1/convert_checkpoint.py +++ b/examples/models/contrib/deepseek_v1/convert_checkpoint.py @@ -79,7 +79,7 @@ def parse_arguments(): type=str, default='trtllm_checkpoint', required=True, - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/deepseek_v2/convert_checkpoint.py b/examples/models/contrib/deepseek_v2/convert_checkpoint.py index 1baa4b7d06..39ef281f04 100755 --- a/examples/models/contrib/deepseek_v2/convert_checkpoint.py +++ b/examples/models/contrib/deepseek_v2/convert_checkpoint.py @@ -79,7 +79,7 @@ def parse_arguments(): type=str, default='trtllm_checkpoint', required=True, - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/dit/convert_checkpoint.py b/examples/models/contrib/dit/convert_checkpoint.py index 1e00295867..16b49ca312 100644 --- a/examples/models/contrib/dit/convert_checkpoint.py +++ b/examples/models/contrib/dit/convert_checkpoint.py @@ -87,7 +87,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument('--input_size', type=int, default=64, diff --git a/examples/models/contrib/falcon/convert_checkpoint.py b/examples/models/contrib/falcon/convert_checkpoint.py index 5012a55687..03584f9d7f 100644 --- a/examples/models/contrib/falcon/convert_checkpoint.py +++ b/examples/models/contrib/falcon/convert_checkpoint.py @@ -74,7 +74,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/gptj/convert_checkpoint.py b/examples/models/contrib/gptj/convert_checkpoint.py index 1f701c5bf5..58749ac5d8 100644 --- a/examples/models/contrib/gptj/convert_checkpoint.py +++ b/examples/models/contrib/gptj/convert_checkpoint.py @@ -61,7 +61,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/gptneox/convert_checkpoint.py b/examples/models/contrib/gptneox/convert_checkpoint.py index 506874a6c2..c78cc4824d 100644 --- a/examples/models/contrib/gptneox/convert_checkpoint.py +++ b/examples/models/contrib/gptneox/convert_checkpoint.py @@ -76,7 +76,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/grok/convert_checkpoint.py b/examples/models/contrib/grok/convert_checkpoint.py index d28d13d3eb..72adbbd5a0 100644 --- 
a/examples/models/contrib/grok/convert_checkpoint.py +++ b/examples/models/contrib/grok/convert_checkpoint.py @@ -110,7 +110,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/mmdit/convert_checkpoint.py b/examples/models/contrib/mmdit/convert_checkpoint.py index 824c994248..e2637a4014 100644 --- a/examples/models/contrib/mmdit/convert_checkpoint.py +++ b/examples/models/contrib/mmdit/convert_checkpoint.py @@ -37,7 +37,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/mpt/convert_checkpoint.py b/examples/models/contrib/mpt/convert_checkpoint.py index e096be3cdf..be73f9f760 100644 --- a/examples/models/contrib/mpt/convert_checkpoint.py +++ b/examples/models/contrib/mpt/convert_checkpoint.py @@ -124,7 +124,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/opt/convert_checkpoint.py b/examples/models/contrib/opt/convert_checkpoint.py index b5f2654dcd..1c7dcd08e8 100644 --- a/examples/models/contrib/opt/convert_checkpoint.py +++ b/examples/models/contrib/opt/convert_checkpoint.py @@ -76,7 +76,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/contrib/stdit/convert_checkpoint.py b/examples/models/contrib/stdit/convert_checkpoint.py index 0640d2fa4b..f62d2366cc 100644 --- a/examples/models/contrib/stdit/convert_checkpoint.py +++ b/examples/models/contrib/stdit/convert_checkpoint.py @@ -44,7 +44,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument('--caption_channels', type=int, default=4096, diff --git a/examples/models/core/bert/convert_checkpoint.py b/examples/models/core/bert/convert_checkpoint.py index ed98c27686..fded1b8287 100644 --- a/examples/models/core/bert/convert_checkpoint.py +++ b/examples/models/core/bert/convert_checkpoint.py @@ -47,7 +47,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/commandr/convert_checkpoint.py b/examples/models/core/commandr/convert_checkpoint.py index 6a4d08904f..b8cd903bd0 100644 --- a/examples/models/core/commandr/convert_checkpoint.py +++ b/examples/models/core/commandr/convert_checkpoint.py @@ -79,7 +79,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM 
checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/gemma/convert_checkpoint.py b/examples/models/core/gemma/convert_checkpoint.py index a79105c166..35ec3959ee 100644 --- a/examples/models/core/gemma/convert_checkpoint.py +++ b/examples/models/core/gemma/convert_checkpoint.py @@ -260,7 +260,7 @@ def main() -> None: trt_llm_config.query_pre_attn_scalar = ckpt_config.query_pre_attn_scalar trt_llm_config_dict = trt_llm_config.to_dict() - print(f"Determined TensorRT-LLM configuration {trt_llm_config_dict}") + print(f"Determined TensorRT LLM configuration {trt_llm_config_dict}") save_config(trt_llm_config, output_dir=args.output_model_dir, log=True) diff --git a/examples/models/core/glm-4-9b/convert_checkpoint.py b/examples/models/core/glm-4-9b/convert_checkpoint.py index 648567952f..e7c8dd2905 100644 --- a/examples/models/core/glm-4-9b/convert_checkpoint.py +++ b/examples/models/core/glm-4-9b/convert_checkpoint.py @@ -127,7 +127,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/gpt/convert_checkpoint.py b/examples/models/core/gpt/convert_checkpoint.py index 84fc17206c..fbc060d7db 100644 --- a/examples/models/core/gpt/convert_checkpoint.py +++ b/examples/models/core/gpt/convert_checkpoint.py @@ -132,7 +132,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/internlm2/convert_checkpoint.py b/examples/models/core/internlm2/convert_checkpoint.py index eb078400b7..2665ee6f3e 100644 --- a/examples/models/core/internlm2/convert_checkpoint.py +++ b/examples/models/core/internlm2/convert_checkpoint.py @@ -71,7 +71,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/llama/convert_checkpoint.py b/examples/models/core/llama/convert_checkpoint.py index e4858c815b..19ea7a769f 100644 --- a/examples/models/core/llama/convert_checkpoint.py +++ b/examples/models/core/llama/convert_checkpoint.py @@ -227,7 +227,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/llama/summarize_long.py b/examples/models/core/llama/summarize_long.py index cee2e07fdd..215ec93d56 100644 --- a/examples/models/core/llama/summarize_long.py +++ b/examples/models/core/llama/summarize_long.py @@ -51,7 +51,7 @@ def parse_args(): '--max_input_len', type=int, default=6400, - help='The max input length TensorRT-LLM engine was built with') + help='The max input length TensorRT LLM engine was built with') parser.add_argument('--log_level', type=str, default='info') parser.add_argument('--max_ite', type=int, default=5) parser.add_argument( @@ -392,7 +392,7 @@ def main(args): references=[hf_summary[ite][beam_idx][batch_idx]]) for beam_idx in range(args.num_beams): - 
logger.info(f"TensorRT-LLM beam {beam_idx} result") + logger.info(f"TensorRT LLM beam {beam_idx} result") computed_metrics_tensorrt_llm = metric_tensorrt_llm[ beam_idx].compute() for key in computed_metrics_tensorrt_llm.keys(): diff --git a/examples/models/core/mamba/convert_checkpoint.py b/examples/models/core/mamba/convert_checkpoint.py index 04c743324f..0afaf10b90 100644 --- a/examples/models/core/mamba/convert_checkpoint.py +++ b/examples/models/core/mamba/convert_checkpoint.py @@ -59,7 +59,7 @@ def parse_arguments(): '--output_dir', type=Path, default='mamba_tllm_checkpoint', - help='The path to save the mamba TensorRT-LLM checkpoint') + help='The path to save the mamba TensorRT LLM checkpoint') parser.add_argument('--log_level', type=str, default='info') parser.add_argument( '--workers', diff --git a/examples/models/core/mllama/convert_checkpoint.py b/examples/models/core/mllama/convert_checkpoint.py index fe8520d5ac..be0a054ad5 100644 --- a/examples/models/core/mllama/convert_checkpoint.py +++ b/examples/models/core/mllama/convert_checkpoint.py @@ -192,7 +192,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/multimodal/eval.py b/examples/models/core/multimodal/eval.py index 01f49ce20e..4a77ac9aa5 100644 --- a/examples/models/core/multimodal/eval.py +++ b/examples/models/core/multimodal/eval.py @@ -132,11 +132,11 @@ def load_hf_model(args): def load_trtllm_model(args): - profiler.start('load TensorRT-LLM model') + profiler.start('load TensorRT LLM model') trtllm_model = MultimodalModelRunner(args) - profiler.stop('load TensorRT-LLM model') + profiler.stop('load TensorRT LLM model') logger.info( - f'Load TensorRT-LLM model takes: {profiler.elapsed_time_in_sec("load TensorRT-LLM model")} sec' + f'Load TensorRT LLM model takes: {profiler.elapsed_time_in_sec("load TensorRT LLM model")} sec' ) return trtllm_model diff --git a/examples/models/core/nemotron_nas/convert_checkpoint.py b/examples/models/core/nemotron_nas/convert_checkpoint.py index eeedd8855d..c59b5b803d 100644 --- a/examples/models/core/nemotron_nas/convert_checkpoint.py +++ b/examples/models/core/nemotron_nas/convert_checkpoint.py @@ -56,7 +56,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/phi/convert_checkpoint.py b/examples/models/core/phi/convert_checkpoint.py index d8bc6df8fd..fa59115343 100644 --- a/examples/models/core/phi/convert_checkpoint.py +++ b/examples/models/core/phi/convert_checkpoint.py @@ -81,7 +81,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/qwen/convert_checkpoint.py b/examples/models/core/qwen/convert_checkpoint.py index 225b4989dc..0711e65ff0 100644 --- a/examples/models/core/qwen/convert_checkpoint.py +++ b/examples/models/core/qwen/convert_checkpoint.py @@ -137,7 +137,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save 
the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/qwen2audio/run.py b/examples/models/core/qwen2audio/run.py index 93e161c7e0..0c72eded66 100644 --- a/examples/models/core/qwen2audio/run.py +++ b/examples/models/core/qwen2audio/run.py @@ -316,7 +316,7 @@ class QWenInfer(object): stream.cuda_stream) stream.synchronize() audio_time = profiler.stop("Audio") / run_time - logger.info(f"TensorRT-LLM Audio latency: {audio_time:3f} sec ") + logger.info(f"TensorRT LLM Audio latency: {audio_time:3f} sec ") assert ok, "Runtime execution failed for audio session" @@ -567,7 +567,7 @@ class QWenInfer(object): print(f'Output(beam: {beam}): "{output_text}"') logger.info(f"Input length={input_lengths[b]}") logger.info(f"Output length={output_ids.shape}") - logger.info(f"TensorRT-LLM QWen time: {Qwen_time:3f} sec ") + logger.info(f"TensorRT LLM QWen time: {Qwen_time:3f} sec ") if isinstance(history, list): history.append({'role': 'assistant', 'content': output_text}) return output_text, past_audio_features diff --git a/examples/models/core/qwenvl/run.py b/examples/models/core/qwenvl/run.py index 06ce341a9a..7013217429 100644 --- a/examples/models/core/qwenvl/run.py +++ b/examples/models/core/qwenvl/run.py @@ -418,7 +418,7 @@ class QWenInfer(object): print(f'Output(beam: {beam}): "{output_text}"') logger.info(f"Input length={input_lengths[b]}") logger.info(f"Output length={output_ids.shape}") - logger.info(f"TensorRT-LLM QWen time: {Qwen_time:3f} sec ") + logger.info(f"TensorRT LLM QWen time: {Qwen_time:3f} sec ") history.append((query, output_text)) return output_text @@ -516,7 +516,7 @@ def vit_process(image_path, vit_engine_path, stream): ok = session_vit.run(visual_inputs, visual_outputs, stream) profiler.stop("ViT") Vit_time = profiler.elapsed_time_in_sec("ViT") / run_time - logger.info(f"TensorRT-LLM ViT latency: {Vit_time:3f} sec ") + logger.info(f"TensorRT LLM ViT latency: {Vit_time:3f} sec ") assert ok, "Runtime execution failed for vit session" diff --git a/examples/models/core/recurrentgemma/convert_checkpoint.py b/examples/models/core/recurrentgemma/convert_checkpoint.py index f7d9bf58c2..2f81bf1d2d 100644 --- a/examples/models/core/recurrentgemma/convert_checkpoint.py +++ b/examples/models/core/recurrentgemma/convert_checkpoint.py @@ -41,7 +41,7 @@ def parse_arguments(): "--output_dir", type=Path, default="recurrentgemma_tllm_checkpoint", - help="The path to save the recurrentgemma TensorRT-LLM checkpoint") + help="The path to save the recurrentgemma TensorRT LLM checkpoint") parser.add_argument("--log_level", type=str, default="info") args = parser.parse_args() return args @@ -506,11 +506,11 @@ def main(): ) trt_llm_config_dict = trt_llm_config.to_dict() - print(f"Determined TensorRT-LLM configuration {trt_llm_config_dict}") + print(f"Determined TensorRT LLM configuration {trt_llm_config_dict}") config_path = args.output_dir / "config.json" config_path.parent.mkdir(exist_ok=True, parents=True) - LOGGER.debug(f"Saving TensorRT-LLM configuration to {config_path}") + LOGGER.debug(f"Saving TensorRT LLM configuration to {config_path}") with config_path.open("w") as config_file: json.dump(trt_llm_config_dict, config_file, indent=4) diff --git a/examples/models/core/vit/convert_checkpoint.py b/examples/models/core/vit/convert_checkpoint.py index 5b1759e357..46f8e2b5e1 100644 --- a/examples/models/core/vit/convert_checkpoint.py +++ b/examples/models/core/vit/convert_checkpoint.py @@ -42,7 
+42,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--workers', type=int, diff --git a/examples/models/core/whisper/convert_checkpoint.py b/examples/models/core/whisper/convert_checkpoint.py index bd9bc1f44f..28dd4e9fac 100644 --- a/examples/models/core/whisper/convert_checkpoint.py +++ b/examples/models/core/whisper/convert_checkpoint.py @@ -62,7 +62,7 @@ def parse_arguments(): parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', - help='The path to save the TensorRT-LLM checkpoint') + help='The path to save the TensorRT LLM checkpoint') parser.add_argument( '--use_weight_only', default=False, diff --git a/examples/openai_triton/manual_plugin/plugin.py b/examples/openai_triton/manual_plugin/plugin.py index a7d559b4be..7009caaeb6 100644 --- a/examples/openai_triton/manual_plugin/plugin.py +++ b/examples/openai_triton/manual_plugin/plugin.py @@ -35,7 +35,7 @@ def _load_triton_plugin_lib(): plugin_lib = triton_plugin_dir / 'build/libtrt_llm_custom_plugins.so' handle = ctypes.CDLL(plugin_lib, mode=ctypes.RTLD_GLOBAL) if handle is None: - raise ImportError('TensorRT-LLM Triton Plugin is unavailable') + raise ImportError('TensorRT LLM Triton Plugin is unavailable') handle.initOpenAiTritonPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] handle.initOpenAiTritonPlugins.restype = ctypes.c_bool assert handle.initOpenAiTritonPlugins( diff --git a/examples/redrafter/convert_checkpoint.py b/examples/redrafter/convert_checkpoint.py index f5bffd0873..4c18ca612e 100644 --- a/examples/redrafter/convert_checkpoint.py +++ b/examples/redrafter/convert_checkpoint.py @@ -102,7 +102,7 @@ def parse_arguments(): "--output_dir", type=str, default="tllm_checkpoint", - help="The path to save the TensorRT-LLM checkpoint", + help="The path to save the TensorRT LLM checkpoint", ) parser.add_argument( "--workers", diff --git a/examples/summarize.py b/examples/summarize.py index 273c170001..ffa4377f85 100644 --- a/examples/summarize.py +++ b/examples/summarize.py @@ -403,7 +403,7 @@ def main(args): ], dim=0) curr_ppl = ppl(curr_logits, curr_ids) - logger.debug(f"TensorRT-LLM PPL: {curr_ppl:.3f} | " + logger.debug(f"TensorRT LLM PPL: {curr_ppl:.3f} | " f"Generation length: {curr_gen_len}") ppls[batch_idx].append(curr_ppl) return output_beams_list, output_ids_list, ppls, lengths_info @@ -622,7 +622,7 @@ def main(args): if runtime_rank == 0 and args.eval_task != "eval_context_ppl": logger.info( "---------------------------------------------------------") - logger.info("TensorRT-LLM Generated: ") + logger.info("TensorRT LLM Generated: ") logger.info(f" Input: {datapoint[dataset_input_key]}") logger.info(f"\n Reference: {datapoint[dataset_output_key]}") logger.info(f"\n Output: {output}") @@ -683,7 +683,7 @@ def main(args): logger.debug('-' * 100) logger.debug(f"Input: {datapoint[dataset_input_key]}") - logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') + logger.debug(f'TensorRT LLM Output: {output_tensorrt_llm}') logger.debug(f"Reference: {datapoint[dataset_output_key]}") data_point_idx += max_batch_size @@ -807,17 +807,17 @@ def main(args): if test_trt_llm: np.random.seed(0) # rouge score use sampling to compute the score logger.info( - f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' + f'TensorRT LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' ) 
logger.info( - f'TensorRT-LLM (total output tokens: {total_output_token_count_trt_llm})' + f'TensorRT LLM (total output tokens: {total_output_token_count_trt_llm})' ) logger.info( - f'TensorRT-LLM (tokens per second: {total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm")})' + f'TensorRT LLM (tokens per second: {total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm")})' ) for beam_idx in range(num_sequences): - logger.info(f"TensorRT-LLM beam {beam_idx} result") + logger.info(f"TensorRT LLM beam {beam_idx} result") if args.eval_task != "eval_context_ppl": if args.estimate_accuracy_std_dev: computed_metrics_tensorrt_llm = metric_tensorrt_llm[ @@ -923,7 +923,7 @@ if __name__ == '__main__': type=str, default=None, help="Directory where to save output sentences. 'trtllm.out' for " - "TensorRT-LLM outputs, and 'hf.out' for HF outputs. If None, do not " + "TensorRT LLM outputs, and 'hf.out' for HF outputs. If None, do not " "save outputs.") parser.add_argument( '--rouge_dir', diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 52abdbcb84..105e648dfc 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -236,9 +236,9 @@ def setup_conan(scripts_dir, venv_python): # Create default profile build_run(f'"{venv_conan}" profile detect -f') - # Add the tensorrt-llm remote if it doesn't exist + # Add the TensorRT LLM remote if it doesn't exist build_run( - f'"{venv_conan}" remote add --force tensorrt-llm https://edge.urm.nvidia.com/artifactory/api/conan/sw-tensorrt-llm-conan', + f'"{venv_conan}" remote add --force TensorRT-LLM https://edge.urm.nvidia.com/artifactory/api/conan/sw-tensorrt-llm-conan', stdout=DEVNULL, stderr=DEVNULL) @@ -481,7 +481,7 @@ def main(*, with working_directory(build_dir): if clean or first_build or configure_cmake: build_run( - f"\"{venv_conan}\" install --build=missing --remote=tensorrt-llm --output-folder={build_dir}/conan -s 'build_type={build_type}' {source_dir}" + f"\"{venv_conan}\" install --build=missing --remote=TensorRT-LLM --output-folder={build_dir}/conan -s 'build_type={build_type}' {source_dir}" ) cmake_def_args.append( f"-DCMAKE_TOOLCHAIN_FILE={build_dir}/conan/conan_toolchain.cmake" diff --git a/tensorrt_llm/__init__.py b/tensorrt_llm/__init__.py index f54026a8cb..a9a929853a 100644 --- a/tensorrt_llm/__init__.py +++ b/tensorrt_llm/__init__.py @@ -115,6 +115,6 @@ __all__ = [ _init() -print(f"[TensorRT-LLM] TensorRT-LLM version: {__version__}") +print(f"[TensorRT-LLM] TensorRT LLM version: {__version__}") sys.stdout.flush() diff --git a/tensorrt_llm/_common.py b/tensorrt_llm/_common.py index 6283cd514d..c0d64abb81 100644 --- a/tensorrt_llm/_common.py +++ b/tensorrt_llm/_common.py @@ -54,10 +54,10 @@ def _init(log_level: object = None) -> None: logger.set_level(log_level) if os.getenv("TRT_LLM_NO_LIB_INIT", "0") == "1": - logger.info("Skipping TensorRT-LLM init.") + logger.info("Skipping TensorRT LLM init.") return - logger.info("Starting TensorRT-LLM init.") + logger.info("Starting TensorRT LLM init.") # load plugin lib _load_plugin_lib() @@ -82,7 +82,7 @@ def _init(log_level: object = None) -> None: MpiComm.local_init() - logger.info("TensorRT-LLM inited.") + logger.info("TensorRT LLM inited.") def default_net() -> Network: diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index 6b5f8717f0..4a6c426aa9 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -260,7 +260,7 @@ class AutoTunerStatistics: class AutoTuner: - 
"""AutoTuner for optimizing TensorRT-LLM operations. + """AutoTuner for optimizing TensorRT LLM operations. This class handles automatic performance tuning of tensor operations by profiling different implementations and caching the best performing configurations. diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index bcd006be71..5cc93e38d9 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -109,8 +109,8 @@ class _ExecutorMemoryMonitor(): f"{self._bytes_to_gib(sample.free_gpu_memory_bytes_pre):.2f} / {self._bytes_to_gib(sample.free_gpu_memory_bytes_post):.2f}" ) for sample in self._samples), "", - ("Please refer to the TensorRT-LLM documentation for information on how " - "to control the memory usage through TensorRT-LLM configuration options. " + ("Please refer to the TensorRT LLM documentation for information on how " + "to control the memory usage through TensorRT LLM configuration options. " "Possible options include:"), *(f" {stage.value}: {tuning_knobs[stage]}" for stage in chain((sample.creation_stage diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index d6cce43776..0ed224a68d 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -1104,7 +1104,7 @@ def is_multi_device_enable(): This method evaluates if we are running on multiple GPUs and the flag ENABLE_MULTI_DEVICE is set. So we can avoid broadcast calls on single GPU. Issue: https://github.com/NVIDIA/TensorRT-LLM/issues/5927 - ENABLE_MULTI_DEVICE is true by default when building tensorrt-llm so we need to also check + ENABLE_MULTI_DEVICE is true by default when building TensorRT LLM so we need to also check the number of devices """ return local_mpi_size() > 1 diff --git a/tensorrt_llm/bench/benchmark/low_latency.py b/tensorrt_llm/bench/benchmark/low_latency.py index af86fb2b1e..455f74a64b 100644 --- a/tensorrt_llm/bench/benchmark/low_latency.py +++ b/tensorrt_llm/bench/benchmark/low_latency.py @@ -36,7 +36,7 @@ from tensorrt_llm.sampling_params import SamplingParams @click.command(name="latency") @optgroup.group("Engine run configuration", - help="Runtime settings for executing a TensorRT-LLM engine.") + help="Runtime settings for executing a TensorRT LLM engine.") @optgroup.option( "--engine_dir", type=click.Path(exists=True, @@ -137,7 +137,7 @@ from tensorrt_llm.sampling_params import SamplingParams "Desired concurrency rate (number of requests processing at the same time), <=0 for no concurrency limit.", ) @optgroup.group("Speculative Decode Options", - help="Runtime settings for executing a TensorRT-LLM engine.") + help="Runtime settings for executing a TensorRT LLM engine.") @optgroup.option( "--medusa_choices", type=click.Path(exists=True, diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index a353b2a883..654f821dd9 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -35,7 +35,7 @@ from tensorrt_llm.sampling_params import SamplingParams @click.command(name="throughput") @optgroup.group("Engine run configuration.", - help="Runtime settings for executing a TensorRT-LLM engine.") + help="Runtime settings for executing a TensorRT LLM engine.") @optgroup.option( "--engine_dir", type=click.Path(exists=True, diff --git a/tensorrt_llm/bench/build/build.py b/tensorrt_llm/bench/build/build.py index 904c7985ec..4de393a5ec 100644 --- 
a/tensorrt_llm/bench/build/build.py +++ b/tensorrt_llm/bench/build/build.py @@ -131,7 +131,7 @@ def apply_build_mode_settings(params): @click.command(name="build") @optgroup.group("Engine Configuration", - help="Configuration of the TensorRT-LLM engine.") + help="Configuration of the TensorRT LLM engine.") @optgroup.option( "--tp_size", "-tp", diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index a4154ee43c..4812ae02f8 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -493,7 +493,7 @@ class ReportUtility: f"Model:\t\t\t{engine['model']}\n" f"Model Path:\t\t{engine['model_path']}\n" f"Engine Directory:\t{engine['engine_dir']}\n" - f"TensorRT-LLM Version:\t{engine['version']}\n" + f"TensorRT LLM Version:\t{engine['version']}\n" f"Dtype:\t\t\t{pretrain_cfg['dtype']}\n" f"KV Cache Dtype:\t\t{pretrain_cfg['quantization']['kv_cache_quant_algo']}\n" f"Quantization:\t\t{pretrain_cfg['quantization']['quant_algo']}\n" @@ -507,7 +507,7 @@ class ReportUtility: "===========================================================\n" f"Model:\t\t\t{engine['model']}\n" f"Model Path:\t\t{engine['model_path']}\n" - f"TensorRT-LLM Version:\t{engine['version']}\n" + f"TensorRT LLM Version:\t{engine['version']}\n" f"Dtype:\t\t\t{engine['dtype']}\n" f"KV Cache Dtype:\t\t{engine['kv_cache_dtype']}\n" f"Quantization:\t\t{engine['quantization']}\n" diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py index 272a865d88..85cba5be90 100644 --- a/tensorrt_llm/builder.py +++ b/tensorrt_llm/builder.py @@ -303,7 +303,7 @@ class Builder(): builder_config) -> bool: ''' For each profile, validate that the named dimensions of different input tensors in this profile all have same range. - TRT will validate the same condition, validate it earlier to make sure the modeling in TensorRT-LLM are correct and + TRT will validate the same condition, validate it earlier to make sure the modeling in TensorRT LLM are correct and makes the error msg more user friendly. ''' valid = True @@ -479,9 +479,9 @@ class Builder(): @dataclass class BuildConfig: - """Configuration class for TensorRT-LLM engine building parameters. + """Configuration class for TensorRT LLM engine building parameters. - This class contains all the configuration parameters needed to build a TensorRT-LLM engine, + This class contains all the configuration parameters needed to build a TensorRT LLM engine, including sequence length limits, batch sizes, optimization settings, and various features. Args: @@ -509,7 +509,7 @@ class BuildConfig: auto_parallel_config (AutoParallelConfig): Configuration for automatic parallelization. Defaults to default AutoParallelConfig. weight_sparsity (bool): Whether to enable weight sparsity optimization. Defaults to False. weight_streaming (bool): Whether to enable weight streaming for large models. Defaults to False. - plugin_config (PluginConfig): Configuration for TensorRT-LLM plugins. Defaults to default PluginConfig. + plugin_config (PluginConfig): Configuration for TensorRT LLM plugins. Defaults to default PluginConfig. use_strip_plan (bool): Whether to use stripped plan for engine building. Defaults to False. max_encoder_input_len (int): Maximum encoder input length for encoder-decoder models. Defaults to 1024. dry_run (bool): Whether to perform a dry run without actually building the engine. Defaults to False. 
diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index 9374883a9c..75630e8eb9 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -62,26 +62,26 @@ def parse_arguments(): '--checkpoint_dir', type=str, default=None, - help="The directory path that contains TensorRT-LLM checkpoint.") + help="The directory path that contains TensorRT LLM checkpoint.") parser.add_argument( '--model_config', type=str, default=None, - help="The file path that saves TensorRT-LLM checkpoint config.") + help="The file path that saves TensorRT LLM checkpoint config.") parser.add_argument( '--build_config', type=str, default=None, - help="The file path that saves TensorRT-LLM build config.") + help="The file path that saves TensorRT LLM build config.") parser.add_argument( '--model_cls_file', type=str, default=None, - help="The file path that defines customized TensorRT-LLM model.") + help="The file path that defines customized TensorRT LLM model.") parser.add_argument('--model_cls_name', type=str, default=None, - help="The customized TensorRT-LLM model class name.") + help="The customized TensorRT LLM model class name.") parser.add_argument( '--output_dir', type=str, diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 06880bc430..6bc0c691d7 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -590,7 +590,7 @@ class Tensor(object): return id(None) def __repr__(self): - return f"TensorRT-LLM Tensor: {self.name=} {self.dtype=} {self.shape=}" + return f"TensorRT LLM Tensor: {self.name=} {self.dtype=} {self.shape=}" def __xor__(self, b): ''' @@ -604,7 +604,7 @@ class Tensor(object): def _create_tensor(trt_tensor: trt.ITensor, producer: trt.ILayer) -> Tensor: ''' - A helper function to create a TensorRT-LLM Tensor object that encapsulates + A helper function to create a TensorRT LLM Tensor object that encapsulates the connection between the TensorRT tensor (trt.ITensor) and the layer (trt.ILayer) that produces it. @@ -626,7 +626,7 @@ def _create_tensor(trt_tensor: trt.ITensor, producer: trt.ILayer) -> Tensor: The producer. Returns: - The TensorRT-LLM tensor (functional.Tensor) that encapsulates the + The TensorRT LLM tensor (functional.Tensor) that encapsulates the TensorRT tensor and the layer that produces it. The former is accessible through the attribute 'trt_tensor' and the latter using the attribute 'producer'. @@ -2051,8 +2051,8 @@ def expand_dims_like(left: Union[Tensor, int, float], right: Tensor) -> Tensor: return left -# If dim is None, return a 1-D TensorRT-LLM tensor of the size -# If dim is not None, return a 0-D TensorRT-LLM tensor of the dimension size +# If dim is None, return a 1-D TensorRT LLM tensor of the size +# If dim is not None, return a 0-D TensorRT LLM tensor of the dimension size def shape(input: Tensor, dim: Optional[int] = None, cast_to_dtype: Optional[Union[str, trt.DataType]] = None, @@ -3471,7 +3471,7 @@ def softplus(input: Tensor, beta: float, threshold: float) -> Tensor: Parameters: input : Tensor - Input TensorRT-LLM Tensor. + Input TensorRT LLM Tensor. beta : float The parameter for softplus computation. threshold : float diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 4f2fbfb015..e9ac3c1e14 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -732,7 +732,7 @@ class BaseLLM: @append_docstring(TRT_LLM_DOCSTRING) class _TrtLLM(BaseLLM): - """LLM class is the main class for running a LLM model using TensorRT-LLM backend. 
+ """LLM class is the main class for running a LLM model using TensorRT LLM backend. Parameters: """ diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py index ea20f023f4..3989ba78ea 100644 --- a/tensorrt_llm/llmapi/llm_utils.py +++ b/tensorrt_llm/llmapi/llm_utils.py @@ -152,7 +152,7 @@ class ModelLoader: if isinstance(self.llm_args.model, Module): # Build engine from user provided model self._build_pipeline.append( - ("Build TensorRT-LLM engine", + ("Build TensorRT LLM engine", self._build_engine_from_inmemory_model)) return diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py index 788ffdd2ca..029a66a63a 100644 --- a/tensorrt_llm/lora_manager.py +++ b/tensorrt_llm/lora_manager.py @@ -205,14 +205,14 @@ def get_hf_target_modules(lora_weights, hf_modules): def invert_module_mapping( trtllm_modules_to_hf_modules: Dict[str, Union[str, List[str]]], ) -> Dict[str, str]: - """Invert module mapping from TensorRT-LLM -> HF to HF -> TensorRT-LLM. + """Invert module mapping from TensorRT LLM -> HF to HF -> TensorRT-LLM. Args: - trtllm_modules_to_hf_modules: Mapping from TensorRT-LLM module names to HF module names + trtllm_modules_to_hf_modules: Mapping from TensorRT LLM module names to HF module names (values can be strings or lists of strings) Returns: - Dictionary mapping HF module names to TensorRT-LLM module names + Dictionary mapping HF module names to TensorRT LLM module names """ hf_modules_to_trtllm_modules: Dict[str, str] = {} for k, hf_modules in trtllm_modules_to_hf_modules.items(): diff --git a/tensorrt_llm/models/eagle/model.py b/tensorrt_llm/models/eagle/model.py index 07a9d97843..e6edc7c676 100644 --- a/tensorrt_llm/models/eagle/model.py +++ b/tensorrt_llm/models/eagle/model.py @@ -736,7 +736,7 @@ class EagleForCausalLM(LLaMAForCausalLM): I|1|0|1|0 J|0|1|0|1 Note that we could've stored FG in KV cache and provide only IJ tokens here - with mask for past KV cache, but it is not supported in TensorRT-LLM attention at the moment. + with mask for past KV cache, but it is not supported in TensorRT LLM attention at the moment. Draft2 produces tokens K and L at positions 6 and 7. 7. Resulting outputs are: diff --git a/tensorrt_llm/models/mmdit_sd3/model.py b/tensorrt_llm/models/mmdit_sd3/model.py index 480119bc36..546abbeade 100644 --- a/tensorrt_llm/models/mmdit_sd3/model.py +++ b/tensorrt_llm/models/mmdit_sd3/model.py @@ -599,7 +599,7 @@ class SD3ModelWeightsLoader(ModelWeightsLoader): def translate_to_external_key(self, tllm_key: str, tllm_to_externel_key_dict: dict): - """Convert and load external checkpoint into a TensorRT-LLM model. + """Convert and load external checkpoint into a TensorRT LLM model. """ trtllm_to_hf_name = { r"transformer_blocks.(\d+).ff(\w*).net.1.weight": diff --git a/tensorrt_llm/models/model_weights_loader.py b/tensorrt_llm/models/model_weights_loader.py index 6cbb8993fd..ab05d8565b 100644 --- a/tensorrt_llm/models/model_weights_loader.py +++ b/tensorrt_llm/models/model_weights_loader.py @@ -26,7 +26,7 @@ class ModelWeightsFormat(Enum): class ModelWeightsLoader: - """Convert and load external checkpoint into a TensorRT-LLM model. + """Convert and load external checkpoint into a TensorRT LLM model. Attributes: model_dir : Model directory or in-memory torch model. 
diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py index 7b2af7af15..a491d172f3 100644 --- a/tensorrt_llm/models/modeling_utils.py +++ b/tensorrt_llm/models/modeling_utils.py @@ -1956,7 +1956,7 @@ def save_config(config: PretrainedConfig, *, output_dir: str, log: bool) -> None: config_path = Path(output_dir) / "config.json" if log: - logger.debug(f"Saving TensorRT-LLM configuration to {config_path}") + logger.debug(f"Saving TensorRT LLM configuration to {config_path}") config_path.parent.mkdir(exist_ok=True, parents=True) config_path.write_text(json.dumps(config.to_dict(), indent=4)) diff --git a/tensorrt_llm/models/qwen/model.py b/tensorrt_llm/models/qwen/model.py index 0eb6e8ac44..7fc8800d14 100644 --- a/tensorrt_llm/models/qwen/model.py +++ b/tensorrt_llm/models/qwen/model.py @@ -478,7 +478,7 @@ class QWenForCausalLM(DecoderModelForCausalLM): logger.debug(f"HuggingFace model: {hf_model}") model = QWenForCausalLM(config) - logger.debug(f"TensorRT-LLM model: {model}") + logger.debug(f"TensorRT LLM model: {model}") if quant_config.quant_algo == QuantAlgo.W4A16_GPTQ: weights = load_weights_from_hf_gptq_model(hf_model, config) diff --git a/tensorrt_llm/models/stdit/model.py b/tensorrt_llm/models/stdit/model.py index 780f5d0790..7e2cc5bdce 100644 --- a/tensorrt_llm/models/stdit/model.py +++ b/tensorrt_llm/models/stdit/model.py @@ -1482,7 +1482,7 @@ class STDiT3ModelWeightsLoader(ModelWeightsLoader): def translate_to_external_key(self, tllm_key: str, tllm_to_externel_key_dict: dict): - """Convert and load external checkpoint into a TensorRT-LLM model. + """Convert and load external checkpoint into a TensorRT LLM model. """ trtllm_to_hf_name = { r"spatial_blocks.(\d+).attn.q_layernorm.weight": diff --git a/tensorrt_llm/models/unet/embeddings.py b/tensorrt_llm/models/unet/embeddings.py index 79ad6ae841..8bfe16a408 100644 --- a/tensorrt_llm/models/unet/embeddings.py +++ b/tensorrt_llm/models/unet/embeddings.py @@ -66,7 +66,7 @@ def get_timestep_embedding(timesteps, else: emb = concat([sin(emb), cos(emb)], dim=1) - #TODO Enable below logic when TensorRT-LLM supports pad feature. + #TODO Enable below logic when TensorRT LLM supports pad feature. # zero pad # if embedding_dim % 2 == 1: # emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) diff --git a/tensorrt_llm/module.py b/tensorrt_llm/module.py index 62d0a1de5d..c67674ec15 100644 --- a/tensorrt_llm/module.py +++ b/tensorrt_llm/module.py @@ -202,7 +202,7 @@ class Module(object): tm = {k: v for k, v in torch_module.named_parameters()} assert sorted(m.keys()) == sorted(tm.keys()), ( - "The parameter names of the tensorrt-llm module must be the same with the torch module" + "The parameter names of the TensorRT LLM module must be the same as the torch module" ) for k, v in self.named_parameters(): diff --git a/tensorrt_llm/plugin/plugin.py b/tensorrt_llm/plugin/plugin.py index 1c4451221f..d2267d51c9 100644 --- a/tensorrt_llm/plugin/plugin.py +++ b/tensorrt_llm/plugin/plugin.py @@ -53,7 +53,7 @@ def _load_plugin_lib(): handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] handle.initTrtLlmPlugins.restype = ctypes.c_bool except AttributeError as err: - raise ImportError('TensorRT-LLM Plugin is unavailable') from err + raise ImportError('TensorRT LLM Plugin is unavailable') from err try: assert handle.initTrtLlmPlugins( @@ -422,7 +422,7 @@ class PluginConfig(metaclass=PluginConfigMeta): init=False, metadata={ "help": - "Enable TensorRT-LLM managed weights to speed up engine building process."
+ "Enable TensorRT LLM managed weights to speed up engine building process." }) _use_fused_mlp: bool = field( default=True, diff --git a/tensorrt_llm/serve/scripts/benchmark_dataset.py b/tensorrt_llm/serve/scripts/benchmark_dataset.py index 35d2744aea..7b0a093b26 100644 --- a/tensorrt_llm/serve/scripts/benchmark_dataset.py +++ b/tensorrt_llm/serve/scripts/benchmark_dataset.py @@ -326,7 +326,7 @@ class RandomDataset(BenchmarkDataset): class CustomDataset(BenchmarkDataset): """ - TensorRT-LLM customized dataset implementation. + TensorRT LLM customized dataset implementation. It assumes the dataset to be consist of several lines of json, each line is a minimal OpenAI API format request. Example format of each sample on each line: { diff --git a/tensorrt_llm/serve/scripts/benchmark_serving.py b/tensorrt_llm/serve/scripts/benchmark_serving.py index 303688f001..459c463179 100644 --- a/tensorrt_llm/serve/scripts/benchmark_serving.py +++ b/tensorrt_llm/serve/scripts/benchmark_serving.py @@ -4,7 +4,7 @@ r"""Benchmark online serving throughput. On the server side, run one of the following commands: - TensorRT-LLM OpenAI API server + TensorRT LLM OpenAI API server trtllm-serve On the client side, run: diff --git a/tensorrt_llm/tools/plugin_gen/templates/functional.py.tpl b/tensorrt_llm/tools/plugin_gen/templates/functional.py.tpl index 281da94730..be67eb117a 100644 --- a/tensorrt_llm/tools/plugin_gen/templates/functional.py.tpl +++ b/tensorrt_llm/tools/plugin_gen/templates/functional.py.tpl @@ -21,7 +21,7 @@ def _load_triton_plugin_lib(): plugin_lib = "[[ plugin_lib_path ]]" handle = ctypes.CDLL(plugin_lib, mode=ctypes.RTLD_GLOBAL) if handle is None: - raise ImportError('TensorRT-LLM Triton Plugin is unavailable') + raise ImportError('TensorRT LLM Triton Plugin is unavailable') handle.initLibNvInferPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] handle.initLibNvInferPlugins.restype = ctypes.c_bool assert handle.initLibNvInferPlugins( diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py index e135ddfa01..85bec29c8c 100644 --- a/tests/integration/defs/accuracy/accuracy_core.py +++ b/tests/integration/defs/accuracy/accuracy_core.py @@ -423,7 +423,7 @@ class CliFlowAccuracyTestHarness: self.env = env def convert(self): - print("Converting model to TensorRT-LLM checkpoint...") + print("Converting model to TensorRT LLM checkpoint...") is_prequantized = False for quant_config_file in [ diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py index a61a5b8c28..809fe42f03 100644 --- a/tests/integration/defs/common.py +++ b/tests/integration/defs/common.py @@ -647,7 +647,7 @@ def get_trt_llm_lib_dir(venv): "import tensorrt_llm; print(f'{tensorrt_llm.__path__[0]}/libs')", caller=check_output).strip() - if "TensorRT-LLM version: " in output: + if "TensorRT LLM version: " in output: output = output.split('\n')[-1] return output.strip() diff --git a/tests/integration/defs/examples/test_gemma.py b/tests/integration/defs/examples/test_gemma.py index c0a6cbceaf..c04ea61806 100644 --- a/tests/integration/defs/examples/test_gemma.py +++ b/tests/integration/defs/examples/test_gemma.py @@ -253,7 +253,7 @@ def gemma_1gpu_summary(batch_size, "run gemm test on 1 gpu" skip_fp8_pre_ada(use_fp8=test_case == "fp8_kv_cache") if "smooth_quant" in test_case and "bfloat16" in data_type: - pytest.skip("TensorRT-LLM does not support SmoothQuant with bfloat16.") + pytest.skip("TensorRT LLM does not support SmoothQuant with bfloat16.") if 
any(params in gemma_model_root for params in ["gemma-7b", "9b", "27b"]) and get_device_memory() < 50000: @@ -349,7 +349,7 @@ def test_llm_gemma_1gpu_mmlu(batch_size, data_type, gemma_model_root, llm_venv, llm_rouge_root, llm_datasets_root, test_case): "run gemm test on 1 gpu" if "smooth_quant" in test_case and "bfloat16" in data_type: - pytest.skip("TensorRT-LLM does not support SmoothQuant with bfloat16.") + pytest.skip("TensorRT LLM does not support SmoothQuant with bfloat16.") ckpt_type = get_ckpt_type(gemma_model_root) ckpt_dir = get_ckpt_dir(gemma_model_root) vocab_file = get_vocab_file(gemma_model_root) diff --git a/tests/integration/defs/perf/build.py b/tests/integration/defs/perf/build.py index d01d007c54..e4d4ca2101 100644 --- a/tests/integration/defs/perf/build.py +++ b/tests/integration/defs/perf/build.py @@ -44,7 +44,7 @@ WEIGHT_STREAMING_DISABLED_VAL = "1.0" def parse_arguments(): - parser = argparse.ArgumentParser(description='Build TensorRT-LLM models.') + parser = argparse.ArgumentParser(description='Build TensorRT LLM models.') parser.add_argument('-m', '--model', type=str, diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py index 03456d8d5c..50f3592b47 100644 --- a/tests/integration/defs/stress_test/stress_test.py +++ b/tests/integration/defs/stress_test/stress_test.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Stress test script for inference of model using TensorRT-LLM with PyTorch/TRT backend. +Stress test script for model inference using TensorRT LLM with PyTorch/TRT backend. This script is used for stress testing inference performance using trtllm-serve and genai-perf. """ import contextlib diff --git a/tests/integration/defs/utils/__init__.py b/tests/integration/defs/utils/__init__.py index 4b60d0c485..059725c3cc 100644 --- a/tests/integration/defs/utils/__init__.py +++ b/tests/integration/defs/utils/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Utility modules for TensorRT-LLM integration tests. +Utility modules for TensorRT LLM integration tests. This package provides various utilities to simplify test development and reduce boilerplate code.
diff --git a/tests/unittest/_torch/thop/test_fused_qk_norm_rope.py b/tests/unittest/_torch/thop/test_fused_qk_norm_rope.py index ad76e9705e..437e9de178 100644 --- a/tests/unittest/_torch/thop/test_fused_qk_norm_rope.py +++ b/tests/unittest/_torch/thop/test_fused_qk_norm_rope.py @@ -148,7 +148,7 @@ def test_fused_qk_norm_rope(head_dim, num_heads_group, num_tokens, is_neox, k_weight, base, is_neox, position_ids) output = qkv # This op is inplace - # Compute reference output using TensorRT-LLM modules + # Compute reference output using TensorRT LLM modules ref_output = torch_ref_rms_norm_rope(qkv_copy, num_heads_q, num_heads_k, num_heads_v, head_dim, eps, q_weight, k_weight, base, is_neox, position_ids) diff --git a/tests/unittest/others/test_plugins.py b/tests/unittest/others/test_plugins.py index 6abc69f96a..842d3ad0f3 100644 --- a/tests/unittest/others/test_plugins.py +++ b/tests/unittest/others/test_plugins.py @@ -7,7 +7,7 @@ import tensorrt_llm.plugin as _tlp def test_load_library(): - """Test loading the TensorRT-LLM plugin library.""" + """Test loading the TensorRT LLM plugin library.""" runtime = _trt.Runtime(_trt.Logger(_trt.Logger.WARNING)) _trt.init_libnvinfer_plugins(runtime.logger, namespace=_tlp.TRT_LLM_PLUGIN_NAMESPACE) diff --git a/tests/unittest/tools/test_prepare_dataset.py b/tests/unittest/tools/test_prepare_dataset.py index d34c337e0b..05da19a595 100644 --- a/tests/unittest/tools/test_prepare_dataset.py +++ b/tests/unittest/tools/test_prepare_dataset.py @@ -54,7 +54,7 @@ class TestPrepareDatasetLora: Build the base command for running prepare_dataset.py. Args: - llm_root: Path to the TensorRT-LLM root directory + llm_root: Path to the TensorRT LLM root directory Returns: List[str]: Base command components @@ -116,7 +116,7 @@ class TestPrepareDatasetLora: output. 
Args: - llm_root: Path to the TensorRT-LLM root directory + llm_root: Path to the TensorRT LLM root directory **kwargs: Keyword arguments for LoRA configuration Returns: diff --git a/tests/unittest/trt/functional/test_gemm_swiglu.py b/tests/unittest/trt/functional/test_gemm_swiglu.py index d264d97093..984c7387cf 100644 --- a/tests/unittest/trt/functional/test_gemm_swiglu.py +++ b/tests/unittest/trt/functional/test_gemm_swiglu.py @@ -82,11 +82,11 @@ class TestGemmSwiglu(unittest.TestCase): net.plugin_config.gemm_swiglu_plugin = dtype with tensorrt_llm.net_guard(net): - # Init TensorRT-LLM tensor for x + # Init TensorRT LLM tensor for x x_tensor = Tensor(name='x', shape=x.shape, dtype=str_dtype_to_trt(dtype)) - # Init TensorRT-LLM tensor for w + # Init TensorRT LLM tensor for w w_tensor = Tensor(name='w', shape=w.shape, dtype=str_dtype_to_trt(dtype)) diff --git a/tests/unittest/trt/functional/test_low_latency_gemm.py b/tests/unittest/trt/functional/test_low_latency_gemm.py index d662fb9213..457d73d3c3 100644 --- a/tests/unittest/trt/functional/test_low_latency_gemm.py +++ b/tests/unittest/trt/functional/test_low_latency_gemm.py @@ -56,11 +56,11 @@ class TestLowLatencyGemm(unittest.TestCase): net = builder.create_network() net.plugin_config.low_latency_gemm_plugin = "fp8" with tensorrt_llm.net_guard(net): - # Init TensorRT-LLM tensor for x + # Init TensorRT LLM tensor for x x_tensor = Tensor(name='x', shape=x.shape, dtype=str_dtype_to_trt('fp8')) - # Init TensorRT-LLM tensor for w + # Init TensorRT LLM tensor for w w_tensor = Tensor(name='w', shape=w.shape, dtype=str_dtype_to_trt('fp8')) diff --git a/tests/unittest/trt/quantization/test_fp8_rowwise_gemm.py b/tests/unittest/trt/quantization/test_fp8_rowwise_gemm.py index 1946e42b26..99c7b94da5 100644 --- a/tests/unittest/trt/quantization/test_fp8_rowwise_gemm.py +++ b/tests/unittest/trt/quantization/test_fp8_rowwise_gemm.py @@ -65,20 +65,20 @@ class TestFp8RowwiseGemm(unittest.TestCase): # Allow fp8_rowwise_gemm_plugin of dtype type network.plugin_config.fp8_rowwise_gemm_plugin = dtype with tensorrt_llm.net_guard(network): - # Init TensorRT-LLM tensor for mat1 + # Init TensorRT LLM tensor for mat1 x = Tensor(name='x', shape=mat1.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt("fp8")) - # Init TensorRT-LLM tensor for mat2 + # Init TensorRT LLM tensor for mat2 y = Tensor(name='y', shape=mat2.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt("fp8")) - # Init TensorRT-LLM tensor for per token scaling + # Init TensorRT LLM tensor for per token scaling scale_a = Tensor( name='scale_a', shape=scale_a_torch.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt("float32")) - # Init TensorRT-LLM tensor for per channel scaling + # Init TensorRT LLM tensor for per channel scaling scale_b = Tensor( name='scale_b', shape=scale_b_torch.shape, @@ -97,7 +97,7 @@ class TestFp8RowwiseGemm(unittest.TestCase): memory_pool_limits={trt.MemoryPoolType.WORKSPACE: 33554432})) assert engine is not None, "Failed to build engine" - # Create TensorRT-LLM session + # Create TensorRT LLM session session = tensorrt_llm.runtime.Session.from_serialized_engine( engine.serialize()) diff --git a/tests/unittest/trt/quantization/test_smooth_quant_gemm.py b/tests/unittest/trt/quantization/test_smooth_quant_gemm.py index fe31c2a6a1..a3f5781cbe 100644 --- a/tests/unittest/trt/quantization/test_smooth_quant_gemm.py +++ b/tests/unittest/trt/quantization/test_smooth_quant_gemm.py @@ -64,17 +64,17 @@ class TestSmoothQuantGemm(unittest.TestCase): if use_plugin: 
network.plugin_config.smooth_quant_gemm_plugin = dtype with tensorrt_llm.net_guard(network): - # Init TensorRT-LLM tensor for mat1 + # Init TensorRT LLM tensor for mat1 x = Tensor(name='x', shape=mat1.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt("int8")) - # Init TensorRT-LLM tensor for mat2 + # Init TensorRT LLM tensor for mat2 y = Tensor(name='y', shape=mat2.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt("int8")) - # Init TensorRT-LLM tensor for per token scaling + # Init TensorRT LLM tensor for per token scaling scale_a = tensorrt_llm.functional.constant(scale_a_torch.numpy()) - # Init TensorRT-LLM tensor for per channel scaling + # Init TensorRT LLM tensor for per channel scaling scale_b = tensorrt_llm.functional.constant(scale_b_torch.numpy()) # Get output tensor for SQ gemm output = smooth_quant_gemm(x, y, scale_a, scale_b, diff --git a/tests/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py b/tests/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py index f0d3549044..7a97e1ba7a 100644 --- a/tests/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py +++ b/tests/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py @@ -55,39 +55,39 @@ class TestWeightOnlyGroupWiseQuantMatmul(unittest.TestCase): network = builder.create_network() network.plugin_config.weight_only_groupwise_quant_matmul_plugin = dtype with tensorrt_llm.net_guard(network): - # Init TensorRT-LLM tensor for activation + # Init TensorRT LLM tensor for activation activation = Tensor( name='activation', shape=th_activation.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt(dtype)) - # Init TensorRT-LLM tensor for pre_quant_scale + # Init TensorRT LLM tensor for pre_quant_scale pre_quant_scale = Tensor( name='pre_quant_scale', shape=th_pre_quant_scale.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt(dtype)) - # Init TensorRT-LLM tensor for weight + # Init TensorRT LLM tensor for weight weight = Tensor(name='weight', shape=th_weight.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt(dtype)) - # Init TensorRT-LLM tensor for scale + # Init TensorRT LLM tensor for scale scale = Tensor(name='scale', shape=th_scale.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt(dtype)) - # Init TensorRT-LLM tensor for zero + # Init TensorRT LLM tensor for zero if th_zero is not None: zero = Tensor(name='zero', shape=th_zero.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt(dtype)) else: zero = None - # Init TensorRT-LLM tensor for bias + # Init TensorRT LLM tensor for bias if th_bias is not None: bias = Tensor(name='bias', shape=th_bias.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt(dtype)) else: bias = None - # Init TensorRT-LLM tensor for alpha + # Init TensorRT LLM tensor for alpha if th_alpha is not None: alpha = Parameter(th_alpha.cpu().numpy(), shape=th_alpha.shape, diff --git a/tests/unittest/trt/quantization/test_weight_only_quant_matmul.py b/tests/unittest/trt/quantization/test_weight_only_quant_matmul.py index 1dae1b405d..3d9623fb60 100644 --- a/tests/unittest/trt/quantization/test_weight_only_quant_matmul.py +++ b/tests/unittest/trt/quantization/test_weight_only_quant_matmul.py @@ -55,13 +55,13 @@ class TestWeightOnlyQuantMatmul(unittest.TestCase): if use_plugin: network.plugin_config.weight_only_quant_matmul_plugin = dtype with tensorrt_llm.net_guard(network): - # Init TensorRT-LLM tensor for mat1 + # Init TensorRT LLM tensor for mat1 x = Tensor(name='x', shape=mat1.shape, dtype=tensorrt_llm._utils.str_dtype_to_trt(dtype)) - # Init TensorRT-LLM tensor for 
weight + # Init TensorRT LLM tensor for weight weights = constant(torch_to_numpy(processed_torch_weights)) - # Init TensorRT-LLM tensor for per channel scaling + # Init TensorRT LLM tensor for per channel scaling scale = constant(torch_to_numpy(torch_weight_scales)) # Get output tensor for WOQ Matmul output = weight_only_quant_matmul(x, diff --git a/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py b/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py index 8bd0329500..f79b6954ea 100644 --- a/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py +++ b/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py @@ -316,7 +316,7 @@ if __name__ == '__main__': default=["tensorrt_llm"], action="append", help= - "Specify the name of the TensorRT-LLM model. Can be specified multiple times to use multiple models." + "Specify the name of the TensorRT LLM model. Can be specified multiple times to use multiple models." ) parser.add_argument('-c', '--concurrency',