[None][chore] Rename TensorRT-LLM to TensorRT LLM for source code. (#7851)

Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Guoming Zhang 2025-09-23 01:05:47 +08:00 committed by GitHub
parent 68b7900a1d
commit 57079cecb3
148 changed files with 311 additions and 311 deletions
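All 311 additions and 311 deletions below are the same one-word substitution: user-facing strings and comments that read "TensorRT-LLM" (or the occasional "TensorRT-LLm" typo) now read "TensorRT LLM", while identifiers and hyphenated file paths, such as the blog filename in the README hunk below, keep their old spelling. As a rough sketch of how a pass like this could be scripted (hypothetical, not taken from the commit; it assumes a plain checkout at ROOT and would still need manual review of URLs and sentence-final occurrences):

# Hypothetical helper, NOT part of this commit: rewrite the product name in
# prose-like contexts only. A following word character, '.', '/', or '-' usually
# indicates a path, URL, or identifier, so those occurrences are left alone
# (this also skips sentence-final "TensorRT-LLM.", which is why review is needed).
import pathlib
import re

ROOT = pathlib.Path(".")  # assumed repository checkout to rewrite
SUFFIXES = {".cpp", ".cu", ".h", ".hpp", ".py", ".md"}
# Matches both the standard spelling and the "LLm" typo seen in some error strings.
PATTERN = re.compile(r"TensorRT-LL[Mm](?![\w./-])")

for path in ROOT.rglob("*"):
    if not path.is_file() or path.suffix not in SUFFIXES:
        continue
    text = path.read_text(encoding="utf-8")
    new_text = PATTERN.sub("TensorRT LLM", text)
    if new_text != text:
        path.write_text(new_text, encoding="utf-8")
        print(f"updated {path}")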

@ -25,7 +25,7 @@ TensorRT LLM
* [08/01] Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)
✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)
* [07/26] N-Gram Speculative Decoding in TensorRT-LLM
* [07/26] N-Gram Speculative Decoding in TensorRT LLM
✨ [➡️ link](./docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md)
* [06/19] Disaggregated Serving in TensorRT LLM

@ -135,7 +135,7 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
int main(int argc, char* argv[])
{
cxxopts::Options options("TensorRT-LLM C++ Runtime Benchmark", "TensorRT-LLM C++ Runtime Benchmark for BERT.");
cxxopts::Options options("TensorRT LLM C++ Runtime Benchmark", "TensorRT LLM C++ Runtime Benchmark for BERT.");
options.add_options()("h,help", "Print usage");
options.add_options()(
"m,model", "Model name specified for engines.", cxxopts::value<std::string>()->default_value("bert_base"));

@ -1145,7 +1145,7 @@ void benchmark(std::vector<std::filesystem::path> const& contextEngineDirs,
int main(int argc, char* argv[])
{
cxxopts::Options options("TensorRT-LLm DisaggServer Benchmark");
cxxopts::Options options("TensorRT LLM DisaggServer Benchmark");
options.add_options()("h,help", "Print usage");
options.add_options()("context_engine_dirs", "Directories that store context engines,separator is a ,",
cxxopts::value<std::vector<std::string>>());

@ -1055,7 +1055,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
int main(int argc, char* argv[])
{
cxxopts::Options options(
"TensorRT-LLM BatchManager Benchmark", "TensorRT-LLM BatchManager Benchmark for GPT and GPT-like models.");
"TensorRT LLM BatchManager Benchmark", "TensorRT LLM BatchManager Benchmark for GPT and GPT-like models.");
options.add_options()("h,help", "Print usage");
options.add_options()("engine_dir, decoder_engine_dir", "Directory that store the engines of decoder models.",
cxxopts::value<std::string>());

@ -217,7 +217,7 @@ std::vector<std::filesystem::path> getJitIncludeDirs()
}
else
{
TLLM_LOG_WARNING("Failed to find TensorRT-LLM installation, DeepGEMM will be disabled.");
TLLM_LOG_WARNING("Failed to find TensorRT LLM installation, DeepGEMM will be disabled.");
}
}
return includeDirs;

@ -165,7 +165,7 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
{
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};

@ -105,7 +105,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm100(T* D, void const* A, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@ -146,15 +146,15 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
occupancy);
break;
case tkc::CutlassTileConfigSM100::Undefined:
throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
break;
case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@ -177,7 +177,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm120(T* D, void const* A, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@ -205,16 +205,16 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm120(T* D, void const* A, void const* B,
occupancy);
break;
case tkc::CutlassTileConfigSM120::Undefined:
throw std::runtime_error("[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
throw std::runtime_error("[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
break;
case tkc::CutlassTileConfigSM120::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
"[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@ -257,7 +257,7 @@ size_t dispatchMXFP8xMXFP4GemmClusterShapeSm100(T* D, void const* A, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@ -293,15 +293,15 @@ size_t dispatchMXFP8xMXFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
occupancy);
break;
case tkc::CutlassTileConfigSM100::Undefined:
throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
break;
case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
break;
}
}
@ -338,7 +338,7 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
}
}
else if constexpr (fp4GemmType == FP4GemmType::W4A4_NVFP4_NVFP4)
@ -356,13 +356,13 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
}
}
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
}
}

@ -93,7 +93,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
int* occupancy)
{
throw std::runtime_error(
"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture.");
"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture.");
}
#else
@ -250,7 +250,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
{
std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
+ std::to_string(mMaxSmemSize);
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
}
/* // Return workspace size */
if (!A && !B && !D)
@ -261,28 +261,28 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
{
std::string errMsg("Requested workspace size insufficient. Required "
+ std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
}
auto can_implement = gemm.can_implement(args);
if (can_implement != cutlass::Status::kSuccess)
{
std::string errMsg = "MXFP8xMXFP4 Gemm cutlass kernel will fail for params. Error: "
+ std::string(cutlassGetStatusString(can_implement));
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
}
auto initStatus = gemm.initialize(args, workspace, stream);
if (initStatus != cutlass::Status::kSuccess)
{
std::string errMsg = "Failed to initialize cutlass MXFP8xMXFP4 gemm. Error: "
+ std::string(cutlassGetStatusString(initStatus));
throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
}
auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL());
if (runStatus != cutlass::Status::kSuccess)
{
std::string errMsg
= "Failed to run cutlass MXFP8xMXFP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
}
return gemm.get_workspace_size(args);
}

@ -107,7 +107,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
int* occupancy) \
{ \
throw std::runtime_error( \
"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \
"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \
}
#else
@ -268,7 +268,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
{ \
std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \
+ std::to_string(mMaxSmemSize); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
/* // Return workspace size */ \
if (!A && !B && !D) \
@ -279,28 +279,28 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
{ \
std::string errMsg("Requested workspace size insufficient. Required " \
+ std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
auto can_implement = gemm.can_implement(args); \
if (can_implement != cutlass::Status::kSuccess) \
{ \
std::string errMsg = "FP4 Gemm cutlass kernel will fail for params. Error: " \
+ std::string(cutlassGetStatusString(can_implement)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
auto initStatus = gemm.initialize(args, workspace, stream); \
if (initStatus != cutlass::Status::kSuccess) \
{ \
std::string errMsg \
= "Failed to initialize cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(initStatus)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \
if (runStatus != cutlass::Status::kSuccess) \
{ \
std::string errMsg \
= "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
return gemm.get_workspace_size(args); \
}

@ -69,7 +69,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
int* occupancy) \
{ \
throw std::runtime_error( \
"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \
"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \
}
#else
@ -224,7 +224,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
{ \
std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \
+ std::to_string(mMaxSmemSize); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
/* // Return workspace size */ \
if (!A && !B && !D) \
@ -235,7 +235,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
{ \
std::string errMsg("Requested workspace size insufficient. Required " \
+ std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
auto initStatus = gemm.initialize(args, workspace); \
if (initStatus != cutlass::Status::kSuccess) \
@ -243,14 +243,14 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
auto cudaErrMsg = cudaGetErrorString(cudaGetLastError()); \
std::string errMsg = "Failed to initialize cutlass FP4 gemm. Error: " \
+ std::string(cutlass::cutlassGetStatusString(initStatus)) + " " + cudaErrMsg; \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \
if (runStatus != cutlass::Status::kSuccess) \
{ \
std::string errMsg \
= "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlass::cutlassGetStatusString(runStatus)); \
throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
} \
return gemm.get_workspace_size(args); \
}

@ -75,7 +75,7 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
{
std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
+ std::to_string(mMaxSmemSize);
throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
}
// Return workspace size
@ -88,7 +88,7 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
{
std::string errMsg("Requested workspace size insufficient. Required "
+ std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
}
auto can_implement = gemm.can_implement(args);
@ -96,21 +96,21 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
{
std::string errMsg = "fp8RowwiseGemm cutlass kernel not implemented given the params. Error: "
+ std::string(cutlassGetStatusString(can_implement));
throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
}
auto initStatus = gemm.initialize(args, workspace, stream);
if (initStatus != cutlass::Status::kSuccess)
{
std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
}
auto runStatus = gemm.run(stream);
if (runStatus != cutlass::Status::kSuccess)
{
std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
}
return gemm.get_workspace_size(args);
}
@ -210,7 +210,7 @@ size_t dispatchGemmConfigSm89(void* D, void const* A, void const* B, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm89] Config is invalid for "
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm89] Config is invalid for "
"Fp8 Rowwise GEMM.");
break;
}
@ -299,16 +299,16 @@ size_t dispatchGemmToCutlassSm89(void* D, void const* A, void const* B, void con
case tkc::CutlassTileConfig::Undefined:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config undefined.");
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config undefined.");
break;
case tkc::CutlassTileConfig::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config should have "
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config should have "
"already been set by heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] Config is invalid for "
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] Config is invalid for "
"Fp8 Rowwise GEMM.");
break;
}
@ -379,7 +379,7 @@ size_t genericFp8RowwiseGemmKernelLauncherSm90(void* D, void const* A, void cons
Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy);
#else // COMPILE_HOPPER_TMA_GEMMS
throw std::runtime_error(
"[TensorRT-LLm Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing "
"[TensorRT LLM Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing "
"90-real as an arch to build_wheel.py.");
#endif // COMPILE_HOPPER_TMA_GEMMS
}
@ -418,7 +418,7 @@ size_t dispatchGemmConfigSm90(void* D, void const* A, void const* B, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm90] Config is invalid for "
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm90] Config is invalid for "
"Fp8 Rowwise GEMM.");
break;
}
@ -468,16 +468,16 @@ size_t dispatchGemmToCutlassSm90(void* D, void const* A, void const* B, void con
break;
case tkc::CutlassTileConfigSM90::Undefined:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
break;
case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
"already been set by heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for "
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for "
"Fp8 Rowwise GEMM.");
break;
}
@ -517,7 +517,7 @@ size_t CutlassFp8RowwiseGemmRunner<T>::dispatchToArch(void* D, void const* A, vo
#endif
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
"Fp8 Rowwise GEMM");
}
return 0;
@ -585,7 +585,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFp8RowwiseGemmRunner<T>::getConfigs()
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
"Fp8 Rowwise GEMM");
}
return candidateConfigs;

@ -209,7 +209,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
{
std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: "
+ std::string(cutlassGetStatusString(can_implement));
throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
}
auto init_status = gemm.initialize(args, workspace, stream);
@ -217,7 +217,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
{
std::string err_msg
= "Failed to initialize cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(init_status));
throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
}
auto run_status = gemm.run(stream);
@ -225,7 +225,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
{
std::string err_msg
= "Failed to run cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(run_status));
throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
}
}
@ -247,14 +247,14 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B, Sca
// Multistage only supported on Ampere
std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
+ std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
}
else if constexpr (Stages == 2 && arch::kMinComputeCapability >= 89)
{
// Multistage only supported on Ampere
std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
+ std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
}
else if constexpr (cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value
&& arch::kMinComputeCapability < 89)
@ -262,7 +262,7 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B, Sca
// FP8 activation type only supported on Ada+ GPUs
std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
+ std::to_string(arch::kMinComputeCapability) + " with activation type set to FP8";
throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
}
else
{
@ -301,7 +301,7 @@ void dispatch_gemm_config(ActivationType const* A, WeightType const* B, ScaleZer
break;
default:
std::string err_msg = "dispatch_gemm_config does not support stages " + std::to_string(gemm_config.stages);
throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_config] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_config] " + err_msg);
break;
}
}
@ -370,16 +370,16 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, Scal
C, m, n, k, group_size, gemm_config, workspace, workspace_bytes, stream, occupancy);
break;
case tkc::CutlassTileConfig::Undefined:
throw std::runtime_error("[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined.");
throw std::runtime_error("[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined.");
break;
case tkc::CutlassTileConfig::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have already been set by "
"[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
"[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
break;
}
}
@ -387,7 +387,7 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, Scal
{
// This is not a limitation in CUTLASS. We just do not need to support this case.
std::string err_msg = "The activation type must equal the scale, bias and output types on Ampere and earlier.";
throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_to_cutlass] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_to_cutlass] " + err_msg);
}
}
@ -439,7 +439,7 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
if constexpr (cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value)
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada needs "
"[TensorRT LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada needs "
"CUDA>=12.4");
}
#endif
@ -459,7 +459,7 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] Arch unsupported for CUTLASS mixed type "
"[TensorRT LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] Arch unsupported for CUTLASS mixed type "
"GEMM");
}
}

@ -62,7 +62,7 @@ void sm90_dispatch_epilogue_schedules(ActivationType const* A, WeightType const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_epilogue_schedules] epilogue schedule config is invalid for "
"[TensorRT LLM Error][fpA_intB][sm90_dispatch_epilogue_schedules] epilogue schedule config is invalid for "
"mixed "
"type GEMM.");
break;
@ -135,7 +135,7 @@ void sm90_dispatch_mainloop_schedules(ActivationType const* A, WeightType const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] mainloop schedule config is invalid "
"[TensorRT LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] mainloop schedule config is invalid "
"for "
"mixed type GEMM.");
break;
@ -144,7 +144,7 @@ void sm90_dispatch_mainloop_schedules(ActivationType const* A, WeightType const*
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] Unsupported CTA and Cluster shapes for "
"[TensorRT LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] Unsupported CTA and Cluster shapes for "
"mixed type GEMM.");
}
}
@ -181,7 +181,7 @@ void sm90_dispatch_gemm_config(ActivationType const* A, WeightType const* B, Sca
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][fpA_intB][dispatch_CGA_config] Config is invalid for mixed type GEMM.");
"[TensorRT LLM Error][fpA_intB][dispatch_CGA_config] Config is invalid for mixed type GEMM.");
break;
}
}
@ -254,16 +254,16 @@ void sm90_dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B,
break;
case tkc::CutlassTileConfigSM90::Undefined:
throw std::runtime_error(
"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config undefined.");
"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config undefined.");
break;
case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config should have already been set by "
"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
break;
}
}

@ -193,7 +193,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
if (group_size % cta_shape_k != 0)
{
std::string err_msg = "The group size must a multiple of " + std::to_string(cta_shape_k);
throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner]" + err_msg);
throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner]" + err_msg);
}
if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY)
@ -249,7 +249,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
Gemm gemm;
if (gemm.get_workspace_size(args) > workspace_bytes)
{
TLLM_LOG_ERROR("[TensorRT-LLm Error][fpA_intB Runner] given workspace size insufficient.");
TLLM_LOG_ERROR("[TensorRT LLM Error][fpA_intB Runner] given workspace size insufficient.");
}
auto can_implement = gemm.can_implement(args);
@ -258,7 +258,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: "
+ std::string(cutlassGetStatusString(can_implement));
std::cout << err_msg << std::endl;
throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
}
auto init_status = gemm.initialize(args, workspace, stream);
@ -266,7 +266,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
{
std::string err_msg = "Failed to initialize cutlass fpA_intB gemm. Error: "
+ std::string(cutlassGetStatusString(init_status));
throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
}
auto run_status = gemm.run(stream);
@ -274,13 +274,13 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
{
std::string err_msg
= "Failed to run cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(run_status));
throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
}
}
else
{
std::stringstream ss;
ss << "[TensorRT-LLm Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << ","
ss << "[TensorRT LLM Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << ","
<< (int64_t) cute::size<1>(CTAShape{}) << "," << (int64_t) cute::size<2>(CTAShape{}) << ") ("
<< (int64_t) cute::size<0>(ClusterShape{}) << "," << (int64_t) cute::size<1>(ClusterShape{}) << ","
<< (int64_t) cute::size<2>(ClusterShape{}) << ") not compiled with FAST_BUILD.";
@ -290,7 +290,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
#else // COMPILE_HOPPER_TMA_GEMMS
throw std::runtime_error(
"[TensorRT-LLm Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an arch "
"[TensorRT LLM Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an arch "
"to build_wheel.py.");
#endif // COMPILE_HOPPER_TMA_GEMMS
}

@ -67,7 +67,7 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
{
std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
+ std::to_string(mMaxSmemSize);
throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
}
// Return workspace size
@ -80,7 +80,7 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
{
std::string errMsg("Requested workspace size insufficient. Required "
+ std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
}
auto can_implement = gemm.can_implement(args);
@ -88,21 +88,21 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
{
std::string errMsg = "fusedGatedGemm cutlass kernel not implemented given the params. Error: "
+ std::string(cutlassGetStatusString(can_implement));
throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
}
auto initStatus = gemm.initialize(args, workspace, stream);
if (initStatus != cutlass::Status::kSuccess)
{
std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
}
auto runStatus = gemm.run(stream);
if (runStatus != cutlass::Status::kSuccess)
{
std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
}
return gemm.get_workspace_size(args);
}
@ -165,7 +165,7 @@ size_t genericGemmGatedKernelLauncherSm90(void* D, void const* A, void const* B,
return typedGemmGatedKernelLauncher(Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy);
#else // COMPILE_HOPPER_TMA_GEMMS
throw std::runtime_error(
"[TensorRT-LLm Error][GemmGatedKernelLauncherSm90] Please recompile with support for hopper by passing 90-real "
"[TensorRT LLM Error][GemmGatedKernelLauncherSm90] Please recompile with support for hopper by passing 90-real "
"as an arch to build_wheel.py.");
#endif // COMPILE_HOPPER_TMA_GEMMS
}
@ -204,7 +204,7 @@ size_t dispatchGemmConfigSm90(void* D, void const* A, void const* B, void const*
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmConfigSm90] Config is invalid for fused "
"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmConfigSm90] Config is invalid for fused "
"gated GEMM.");
break;
}
@ -255,17 +255,17 @@ size_t dispatchGemmToCutlassSm90(void* D, void const* A, void const* B, void con
break;
case tkc::CutlassTileConfigSM90::Undefined:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
break;
case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
"already been set by "
"heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for fused "
"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for fused "
"gated GEMM.");
break;
}
@ -302,14 +302,14 @@ size_t CutlassFusedGatedGemmRunner<T>::dispatchToArch(void* D, void const* A, vo
#endif
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
"gated GEMM");
}
}
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
"gated "
"GEMM");
}
@ -340,7 +340,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFusedGatedGemmRunner<T>::getConfigs()
if (mSm != 90)
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
"gated GEMM");
}
tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param
@ -378,7 +378,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFusedGatedGemmRunner<T>::getConfigs()
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
"gated "
"GEMM");
}

@ -150,7 +150,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
{
std::string errMsg = "int8gemm cutlass kernel will fail for params. Error: "
+ std::string(cutlassGetStatusString(can_implement));
throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
}
auto initStatus = gemm.initialize(args, workspace, stream);
@ -158,7 +158,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
{
std::string errMsg
= "Failed to initialize cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(initStatus));
throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
}
auto runStatus = gemm.run(stream);
@ -166,7 +166,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
{
std::string errMsg
= "Failed to run cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
}
}
@ -180,7 +180,7 @@ struct dispatchStages
TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
std::string errMsg = "Cutlass int8 gemm. Not instantiates for arch "
+ std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
throw std::runtime_error("[TensorRT-LLM Error][dispatchStages::dispatch] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][dispatchStages::dispatch] " + errMsg);
}
};
@ -248,7 +248,7 @@ void dispatchGemmConfig(int8_t const* A, int8_t const* B, tk::QuantMode quantOpt
break;
default:
std::string errMsg = "dispatchGemmConfig does not support stages " + std::to_string(gemmConfig.stages);
throw std::runtime_error("[TensorRT-LLM Error][dispatch_gemm_config] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_config] " + errMsg);
break;
}
}
@ -288,16 +288,16 @@ void dispatchGemmToCutlass(int8_t const* A, int8_t const* B, tk::QuantMode quant
quantOption, alphaCol, alphaRow, C, m, n, k, gemmConfig, workspace, workspaceBytes, stream, occupancy);
break;
case tkc::CutlassTileConfig::Undefined:
throw std::runtime_error("[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] gemm config undefined.");
throw std::runtime_error("[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] gemm config undefined.");
break;
case tkc::CutlassTileConfig::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] gemm config should have already been set by "
"[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] gemm config should have already been set by "
"heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] Config is invalid for int8 GEMM.");
"[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] Config is invalid for int8 GEMM.");
break;
}
}
@ -342,7 +342,7 @@ void CutlassInt8GemmRunner<T>::dispatchToArch(int8_t const* A, int8_t const* B,
else
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
"[TensorRT LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
}
}
@ -364,7 +364,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassInt8GemmRunner<T>::getConfigs() const
if (mSm <= 70)
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
"[TensorRT LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
}
std::vector<tkc::CutlassGemmConfig> candidateConfigs = get_candidate_configs(mSm, SPLIT_K_LIMIT, config_type_param);

@ -195,7 +195,7 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
{
std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
+ std::to_string(mMaxSmemSize);
throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
}
// Return workspace size
@ -208,7 +208,7 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
{
std::string errMsg("Requested workspace size insufficient. Required "
+ std::to_string(gemm.get_workspace_size(arguments)) + ", got " + std::to_string(workspaceBytes));
throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
}
auto can_implement = gemm.can_implement(arguments);
@ -216,26 +216,26 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
{
std::string errMsg = "Fp8LowLatencyGemm cutlass kernel not implemented given the params. Error: "
+ std::string(cutlassGetStatusString(can_implement));
throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
}
auto initStatus = gemm.initialize(arguments, workspacePtr);
if (initStatus != cutlass::Status::kSuccess)
{
std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
}
auto runStatus = gemm.run(stream, nullptr, pdl_overlap_ratio >= 0);
if (runStatus != cutlass::Status::kSuccess)
{
std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
}
return gemm.get_workspace_size(arguments);
#else // COMPILE_HOPPER_TMA_GEMMS
throw std::runtime_error(
"[TensorRT-LLm Error][genericFp8LowLatencyGemmKernelLauncherSm90] Please recompile with support for hopper by "
"[TensorRT LLM Error][genericFp8LowLatencyGemmKernelLauncherSm90] Please recompile with support for hopper by "
"passing 90-real as an arch to build_wheel.py.");
#endif // COMPILE_HOPPER_TMA_GEMMS
}
@ -264,7 +264,7 @@ size_t dispatchLowLatencyGemmCultassKernelSchedSm90(__nv_fp8_e4m3 const* A, __nv
break;
default:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmCultassKernelSchedSm90] Config "
"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmCultassKernelSchedSm90] Config "
"is "
"invalid for low latency fp8 gemm");
break;
@ -300,7 +300,7 @@ size_t dispatchLowLatencyGemmClusterShapeSm90(__nv_fp8_e4m3 const* A, __nv_fp8_e
default:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmClusterShapeSm90] Config is "
"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmClusterShapeSm90] Config is "
"invalid for low latency fp8 gemm");
break;
}
@ -369,19 +369,19 @@ size_t dispatchLowLatencyGemmToCutlassSm90(__nv_fp8_e4m3 const* A, __nv_fp8_e4m3
break;
case tkc::CutlassTileConfigSM90::Undefined:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
"undefined.");
break;
case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
"should have "
"already been set by "
"heuristic.");
break;
default:
throw std::runtime_error(
"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] Config is "
"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] Config is "
"invalid for low latency fp8 gemm");
break;
}
@ -413,7 +413,7 @@ size_t CutlassLowLatencyFp8GemmRunner<T>::dispatchToArch(__nv_fp8_e4m3 const* A,
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS Low "
"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS Low "
"Latency Gemm");
}
return 0;
@ -499,7 +499,7 @@ std::vector<ConfigType> CutlassLowLatencyFp8GemmRunner<T>::getConfigs() const
if (mSm != 90)
{
throw std::runtime_error(
"[TensorRT-LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP8 Low "
"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP8 Low "
"Latency GEMM");
}
tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param

@ -235,12 +235,12 @@ struct BatchedGemmData
void const* mPtrBias{nullptr};
// The output tensor scaling factor for MxFp{4,8}, Fp8 and NvFp4 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [B].
float const* mPtrScaleC{nullptr};
// The output gate scale for MxFp{4,8} and NvFp4 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [B].
float const* mPtrScaleGate{nullptr};

@ -214,12 +214,12 @@ struct KernelParams
// ScaleC = SEncC
//
// The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [B]. One scaling factor per tensor in batch.
float const* ptrScaleC{nullptr};
// The output gate scale for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [B]. One scaling factor per tensor in batch.
float const* ptrScaleGate{nullptr};

@ -143,7 +143,7 @@ struct GemmData
void const* mPtrPerTokenSfB{nullptr};
// The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [1].
void* mPtrScaleC{nullptr};
};

@ -204,7 +204,7 @@ struct KernelParams
void* ptrSfC;
// The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [1].
float const* ptrScaleC;

@ -133,11 +133,11 @@ struct GemmGatedActData
void const* mPtrPerTokenSfB{nullptr};
// The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [1].
void const* mPtrScaleC{nullptr};
// The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [1].
void const* mPtrScaleGate{nullptr};
};

@ -290,7 +290,7 @@ struct KernelParams
// y = act(ptrScaleGate[0] * y1) * (ptrScaleC[0] * y2)
//
// The output tensor scaling factor for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.
// TensorRT-LLM API requires a scaling factor on the device.
// TensorRT LLM API requires a scaling factor on the device.
// Shape is [1].
float const* ptrScaleC;
// The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.

@ -73,7 +73,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con
NB_MODULE(TRTLLM_NB_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
m.doc() = "TensorRT LLM Python bindings for C++ runtime";
m.attr("binding_type") = "nanobind";
nb::set_leak_warnings(false);

@ -125,7 +125,7 @@ BertAttentionPlugin::BertAttentionPlugin(void const* data, size_t length)
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ CudaStreamPlugin::CudaStreamPlugin(void const* data, size_t length)
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -58,7 +58,7 @@ EagleDecodeDraftTokensPlugin::EagleDecodeDraftTokensPlugin(void const* data, siz
read(d, mTopKSampling);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
static_cast<int>(length), static_cast<int>(d - a));
}

@ -52,7 +52,7 @@ EagleSampleAndAcceptDraftTokensPlugin::EagleSampleAndAcceptDraftTokensPlugin(voi
read(d, mDtype);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -47,7 +47,7 @@ FusedLayernormPlugin::FusedLayernormPlugin(void const* data, size_t length)
read(d, mType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -203,7 +203,7 @@ static GemmAllReducePluginOptions deserializeOptions(void const*& data, size_t l
TLLM_CHECK_WITH_INFO(end == begin + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (end - begin));

@ -179,7 +179,7 @@ GemmPlugin::GemmPlugin(void const* data, size_t length, GemmPlugin::PluginProfil
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -183,7 +183,7 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const* data, size_t leng
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
TLLM_CHECK_WITH_INFO((smVersion() >= 80) || (mType != nvinfer1::DataType::kBF16),

@ -35,7 +35,7 @@ IdentityPlugin::IdentityPlugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -61,7 +61,7 @@ LayernormQuantizationPlugin::LayernormQuantizationPlugin(void const* data, size_
read(d, mOutputType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ LookupPlugin::LookupPlugin(void const* data, size_t length)
read(d, mRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -78,7 +78,7 @@ LoraPlugin::LoraPlugin(void const* data, size_t length, LoraPlugin::PluginProfil
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -124,7 +124,7 @@ LowLatencyGemmPlugin::LowLatencyGemmPlugin(void const* data, size_t length, Plug
mPluginProfiler->deserialize(d, mDims, mGemmId);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -159,7 +159,7 @@ LowLatencyGemmSwigluPlugin::LowLatencyGemmSwigluPlugin(
mPluginProfiler->deserialize(d, mDims, mGemmId);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -175,7 +175,7 @@ MixtureOfExpertsPlugin::MixtureOfExpertsPlugin(void const* data, size_t length,
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ AllgatherPlugin::AllgatherPlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -77,7 +77,7 @@ AllreducePlugin::AllreducePlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
check();

View File

@ -45,7 +45,7 @@ RecvPlugin::RecvPlugin(void const* data, size_t length)
read(d, mSrcRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -48,7 +48,7 @@ ReduceScatterPlugin::ReduceScatterPlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -46,7 +46,7 @@ SendPlugin::SendPlugin(void const* data, size_t length)
read(d, mTgtRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -64,7 +64,7 @@ QServeGemmPlugin::QServeGemmPlugin(void const* data, size_t length)
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -51,7 +51,7 @@ QuantizePerTokenPlugin::QuantizePerTokenPlugin(void const* data, size_t length)
read(d, mSumPerToken);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -35,7 +35,7 @@ QuantizeTensorPlugin::QuantizeTensorPlugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -41,7 +41,7 @@ QuantizeToFP4Plugin::QuantizeToFP4Plugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -58,7 +58,7 @@ RmsnormQuantizationPlugin::RmsnormQuantizationPlugin(void const* data, size_t le
read(d, mOutputType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -98,7 +98,7 @@ SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -148,7 +148,7 @@ WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -126,7 +126,7 @@ WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin(
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

View File

@ -67,7 +67,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con
PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
m.doc() = "TensorRT LLM Python bindings for C++ runtime";
m.attr("binding_type") = "pybind";
// Create MpiComm binding first since it's used in the executor bindings

View File

@ -56,7 +56,7 @@ public:
}
/// @brief If multiple TensorRT optimization profiles are built in the engine, this function selects the
/// corresponding profile that is going to be used based on the runtime shape, for now, TensorRT-LLM only split
/// corresponding profile that is going to be used based on the runtime shape, for now, TensorRT LLM only split
/// multiple profiles on the num_tokens dimension, hence the profile index is selected based on which profile
/// handles the actual num_tokens
/// @return The index of the selected TensorRT optimization profile
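A minimal sketch of the selection rule this comment describes: since profiles are split only along the num_tokens dimension, the runtime picks the first profile whose num_tokens range covers the actual token count. The range struct below is a placeholder, not the real runtime type:

```cpp
// Illustrative only: assumes each optimization profile exposes the
// [min, max] num_tokens range it was built for.
#include <cstdint>
#include <optional>
#include <vector>

struct ProfileTokenRange
{
    int32_t minNumTokens;
    int32_t maxNumTokens;
};

// Return the index of the first profile whose num_tokens range covers the
// actual number of tokens in the current batch, if any profile does.
std::optional<size_t> selectOptimizationProfile(
    std::vector<ProfileTokenRange> const& profiles, int32_t actualNumTokens)
{
    for (size_t i = 0; i < profiles.size(); ++i)
    {
        if (actualNumTokens >= profiles[i].minNumTokens
            && actualNumTokens <= profiles[i].maxNumTokens)
        {
            return i;
        }
    }
    return std::nullopt;
}
```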

View File

@ -330,7 +330,7 @@ protected:
{
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};
@ -732,7 +732,7 @@ protected:
{
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};

View File

@ -70,7 +70,7 @@ std::unique_ptr<texec::kv_cache::ConnectionManager> makeOneUcxConnectionManager(
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};
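These call sites resolve symbols from the UCX wrapper library lazily at runtime, which is why a build without UCX support surfaces as the explanatory error above rather than as a link failure. A minimal POSIX sketch of that lazy-loading pattern; the library and symbol names are placeholders, not the actual TensorRT LLM ones:

```cpp
// Illustrative sketch only: library and symbol names are placeholders.
#include <dlfcn.h>
#include <stdexcept>
#include <string>

// Load an optional shared library at runtime, failing with a readable error.
void* loadOptionalLibrary(char const* libName)
{
    void* handle = dlopen(libName, RTLD_NOW | RTLD_LOCAL);
    if (handle == nullptr)
    {
        char const* err = dlerror();
        throw std::runtime_error(std::string("Unable to load ") + libName + ": "
            + (err != nullptr ? err : "unknown error"));
    }
    return handle;
}

// Resolve a required symbol from an already-loaded library handle.
void* getRequiredSymbol(void* handle, char const* name)
{
    void* sym = dlsym(handle, name);
    if (sym == nullptr)
    {
        throw std::runtime_error(std::string("Unable to load symbol ") + name
            + "; the library was likely built without this feature enabled.");
    }
    return sym;
}

// Usage sketch (hypothetical names):
//   using MakeConnectionManagerFn = void* (*)();
//   void* handle = loadOptionalLibrary("libexample_ucx_wrapper.so");
//   auto makeManager = reinterpret_cast<MakeConnectionManagerFn>(
//       getRequiredSymbol(handle, "makeExampleConnectionManager"));
```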

View File

@ -243,7 +243,7 @@ Result run(std::string description, Options& options, Buffers& buffers)
auto can_implement = device_gemm.can_implement(arguments);
if (can_implement != cutlass::Status::kSuccess)
{
throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner]");
throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner]");
}
// Initialize CUTLASS kernel with arguments and workspace pointer
@ -481,7 +481,7 @@ int main(int argc, char const** argv)
#ifdef COMPILE_HOPPER_TMA_GEMMS
Result hopperFp8 = run<Gemm>(std::string("Hopper fp8 swiglu"), options, buffers);
#else // COMPILE_HOPPER_TMA_GEMMS
std::cout << "[TensorRT-LLm Error][GemmSwigluKernelTestSm90Fp8] Please recompile with support for hopper by "
std::cout << "[TensorRT LLM Error][GemmSwigluKernelTestSm90Fp8] Please recompile with support for hopper by "
"passing 90-real as an arch to build_wheel.py."
<< std::endl;
#endif // COMPILE_HOPPER_TMA_GEMMS

View File

@ -338,7 +338,7 @@ TEST(GemmSwigluRunner, Sm90FP8)
Result hopperFp8 = run("SM90 FP8 WS GEMM", options, buffers);
EXPECT_TRUE(hopperFp8.passed);
#else // COMPILE_HOPPER_TMA_GEMMS
std::cout << "[TensorRT-LLm Error][GemmSwigluRunnerTest] Please recompile with support for hopper by passing "
std::cout << "[TensorRT LLM Error][GemmSwigluRunnerTest] Please recompile with support for hopper by passing "
"90-real as an arch to build_wheel.py."
<< std::endl;
#endif // COMPILE_HOPPER_TMA_GEMMS

View File

@ -1,6 +1,6 @@
"""
NOTE: This FastAPI-based server is only an example for demonstrating the usage
of TensorRT-LLM LLM API. It is not intended for production use.
of TensorRT LLM LLM API. It is not intended for production use.
For production, use the `trtllm-serve` command. The server exposes OpenAI compatible API endpoints.
"""

View File

@ -28,11 +28,11 @@ int main(int argc, char* argv[])
void log(nvinfer1::ILogger::Severity severity, char const* msg) noexcept override
{
if (severity <= nvinfer1::ILogger::Severity::kERROR)
std::cerr << "[TensorRT-LLM ERR]: " << msg << std::endl;
std::cerr << "[TensorRT LLM ERR]: " << msg << std::endl;
else if (severity == nvinfer1::ILogger::Severity::kWARNING)
std::cerr << "[TensorRT-LLM WARNING]: " << msg << std::endl;
std::cerr << "[TensorRT LLM WARNING]: " << msg << std::endl;
else
std::cout << "[TensorRT-LLM LOG]: " << msg << std::endl;
std::cout << "[TensorRT LLM LOG]: " << msg << std::endl;
}
};

View File

@ -144,7 +144,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -12,7 +12,7 @@ def parse_arguments():
'--output_path',
type=str,
default='config.json',
help='The path to save the TensorRT-LLM checkpoint config.json file')
help='The path to save the TensorRT LLM checkpoint config.json file')
parser.add_argument('--architecture', type=str, default='GPTForCausalLM')
parser.add_argument('--dtype',
type=str,

View File

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

View File

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

View File

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

View File

@ -161,7 +161,7 @@ def demonstrate_with_logprobs(prompt: str):
def run_all_demonstrations(model_path: Optional[str] = None):
"""Run all sampling demonstrations."""
print("🚀 TensorRT-LLM Sampling Techniques Showcase")
print("🚀 TensorRT LLM Sampling Techniques Showcase")
print("=" * 50)
# Use the first prompt for most demonstrations

View File

@ -161,7 +161,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -53,7 +53,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -156,7 +156,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=Path,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--calib_dataset',
type=str,

View File

@ -190,7 +190,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -90,7 +90,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -79,7 +79,7 @@ def parse_arguments():
type=str,
default='trtllm_checkpoint',
required=True,
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -79,7 +79,7 @@ def parse_arguments():
type=str,
default='trtllm_checkpoint',
required=True,
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -87,7 +87,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument('--input_size',
type=int,
default=64,

View File

@ -74,7 +74,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -61,7 +61,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -76,7 +76,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -110,7 +110,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -37,7 +37,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -124,7 +124,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -76,7 +76,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -44,7 +44,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument('--caption_channels',
type=int,
default=4096,

View File

@ -47,7 +47,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -79,7 +79,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -260,7 +260,7 @@ def main() -> None:
trt_llm_config.query_pre_attn_scalar = ckpt_config.query_pre_attn_scalar
trt_llm_config_dict = trt_llm_config.to_dict()
print(f"Determined TensorRT-LLM configuration {trt_llm_config_dict}")
print(f"Determined TensorRT LLM configuration {trt_llm_config_dict}")
save_config(trt_llm_config, output_dir=args.output_model_dir, log=True)

View File

@ -127,7 +127,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -132,7 +132,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -71,7 +71,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -227,7 +227,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -51,7 +51,7 @@ def parse_args():
'--max_input_len',
type=int,
default=6400,
help='The max input length TensorRT-LLM engine was built with')
help='The max input length TensorRT LLM engine was built with')
parser.add_argument('--log_level', type=str, default='info')
parser.add_argument('--max_ite', type=int, default=5)
parser.add_argument(
@ -392,7 +392,7 @@ def main(args):
references=[hf_summary[ite][beam_idx][batch_idx]])
for beam_idx in range(args.num_beams):
logger.info(f"TensorRT-LLM beam {beam_idx} result")
logger.info(f"TensorRT LLM beam {beam_idx} result")
computed_metrics_tensorrt_llm = metric_tensorrt_llm[
beam_idx].compute()
for key in computed_metrics_tensorrt_llm.keys():

View File

@ -59,7 +59,7 @@ def parse_arguments():
'--output_dir',
type=Path,
default='mamba_tllm_checkpoint',
help='The path to save the mamba TensorRT-LLM checkpoint')
help='The path to save the mamba TensorRT LLM checkpoint')
parser.add_argument('--log_level', type=str, default='info')
parser.add_argument(
'--workers',

View File

@ -192,7 +192,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -132,11 +132,11 @@ def load_hf_model(args):
def load_trtllm_model(args):
profiler.start('load TensorRT-LLM model')
profiler.start('load TensorRT LLM model')
trtllm_model = MultimodalModelRunner(args)
profiler.stop('load TensorRT-LLM model')
profiler.stop('load TensorRT LLM model')
logger.info(
f'Load TensorRT-LLM model takes: {profiler.elapsed_time_in_sec("load TensorRT-LLM model")} sec'
f'Load TensorRT LLM model takes: {profiler.elapsed_time_in_sec("load TensorRT LLM model")} sec'
)
return trtllm_model

View File

@ -56,7 +56,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -81,7 +81,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -137,7 +137,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

View File

@ -316,7 +316,7 @@ class QWenInfer(object):
stream.cuda_stream)
stream.synchronize()
audio_time = profiler.stop("Audio") / run_time
logger.info(f"TensorRT-LLM Audio latency: {audio_time:3f} sec ")
logger.info(f"TensorRT LLM Audio latency: {audio_time:3f} sec ")
assert ok, "Runtime execution failed for audio session"
@ -567,7 +567,7 @@ class QWenInfer(object):
print(f'Output(beam: {beam}): "{output_text}"')
logger.info(f"Input length={input_lengths[b]}")
logger.info(f"Output length={output_ids.shape}")
logger.info(f"TensorRT-LLM QWen time: {Qwen_time:3f} sec ")
logger.info(f"TensorRT LLM QWen time: {Qwen_time:3f} sec ")
if isinstance(history, list):
history.append({'role': 'assistant', 'content': output_text})
return output_text, past_audio_features

View File

@ -418,7 +418,7 @@ class QWenInfer(object):
print(f'Output(beam: {beam}): "{output_text}"')
logger.info(f"Input length={input_lengths[b]}")
logger.info(f"Output length={output_ids.shape}")
logger.info(f"TensorRT-LLM QWen time: {Qwen_time:3f} sec ")
logger.info(f"TensorRT LLM QWen time: {Qwen_time:3f} sec ")
history.append((query, output_text))
return output_text
@ -516,7 +516,7 @@ def vit_process(image_path, vit_engine_path, stream):
ok = session_vit.run(visual_inputs, visual_outputs, stream)
profiler.stop("ViT")
Vit_time = profiler.elapsed_time_in_sec("ViT") / run_time
logger.info(f"TensorRT-LLM ViT latency: {Vit_time:3f} sec ")
logger.info(f"TensorRT LLM ViT latency: {Vit_time:3f} sec ")
assert ok, "Runtime execution failed for vit session"

View File

@ -41,7 +41,7 @@ def parse_arguments():
"--output_dir",
type=Path,
default="recurrentgemma_tllm_checkpoint",
help="The path to save the recurrentgemma TensorRT-LLM checkpoint")
help="The path to save the recurrentgemma TensorRT LLM checkpoint")
parser.add_argument("--log_level", type=str, default="info")
args = parser.parse_args()
return args
@ -506,11 +506,11 @@ def main():
)
trt_llm_config_dict = trt_llm_config.to_dict()
print(f"Determined TensorRT-LLM configuration {trt_llm_config_dict}")
print(f"Determined TensorRT LLM configuration {trt_llm_config_dict}")
config_path = args.output_dir / "config.json"
config_path.parent.mkdir(exist_ok=True, parents=True)
LOGGER.debug(f"Saving TensorRT-LLM configuration to {config_path}")
LOGGER.debug(f"Saving TensorRT LLM configuration to {config_path}")
with config_path.open("w") as config_file:
json.dump(trt_llm_config_dict, config_file, indent=4)

View File

@ -42,7 +42,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

Some files were not shown because too many files have changed in this diff.