Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[None][chore] Rename TensorRT-LLM to TensorRT LLM for source code. (#7851)
Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
parent 68b7900a1d
commit 57079cecb3
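The diffs below are mechanical string replacements in user-facing text (error messages, log strings, option descriptions, and comments). As a rough illustration only — the helper below is hypothetical and not part of the TensorRT LLM repository, it simply shows the transformation the commit applies to each affected string:

```cpp
#include <iostream>
#include <string>

// Hypothetical helper (not from the TensorRT LLM codebase): replace every
// occurrence of `from` with `to` in `text`.
std::string replaceAll(std::string text, std::string const& from, std::string const& to)
{
    for (std::size_t pos = text.find(from); pos != std::string::npos; pos = text.find(from, pos + to.size()))
    {
        text.replace(pos, from.size(), to);
    }
    return text;
}

int main()
{
    std::string msg = "[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture.";
    // Prints the renamed message, matching the "+" lines in the diffs below.
    std::cout << replaceAll(msg, "TensorRT-LLM", "TensorRT LLM") << std::endl;
    return 0;
}
```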
@@ -25,7 +25,7 @@ TensorRT LLM
 * [08/01] Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)
 ✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)

-* [07/26] N-Gram Speculative Decoding in TensorRT‑LLM
+* [07/26] N-Gram Speculative Decoding in TensorRT LLM
 ✨ [➡️ link](./docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md)

 * [06/19] Disaggregated Serving in TensorRT LLM
@@ -135,7 +135,7 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da

 int main(int argc, char* argv[])
 {
-cxxopts::Options options("TensorRT-LLM C++ Runtime Benchmark", "TensorRT-LLM C++ Runtime Benchmark for BERT.");
+cxxopts::Options options("TensorRT LLM C++ Runtime Benchmark", "TensorRT LLM C++ Runtime Benchmark for BERT.");
 options.add_options()("h,help", "Print usage");
 options.add_options()(
 "m,model", "Model name specified for engines.", cxxopts::value<std::string>()->default_value("bert_base"));
@@ -1145,7 +1145,7 @@ void benchmark(std::vector<std::filesystem::path> const& contextEngineDirs,

 int main(int argc, char* argv[])
 {
-cxxopts::Options options("TensorRT-LLm DisaggServer Benchmark");
+cxxopts::Options options("TensorRT LLM DisaggServer Benchmark");
 options.add_options()("h,help", "Print usage");
 options.add_options()("context_engine_dirs", "Directories that store context engines,separator is a ,",
 cxxopts::value<std::vector<std::string>>());
@@ -1055,7 +1055,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
 int main(int argc, char* argv[])
 {
 cxxopts::Options options(
-"TensorRT-LLM BatchManager Benchmark", "TensorRT-LLM BatchManager Benchmark for GPT and GPT-like models.");
+"TensorRT LLM BatchManager Benchmark", "TensorRT LLM BatchManager Benchmark for GPT and GPT-like models.");
 options.add_options()("h,help", "Print usage");
 options.add_options()("engine_dir, decoder_engine_dir", "Directory that store the engines of decoder models.",
 cxxopts::value<std::string>());
@@ -217,7 +217,7 @@ std::vector<std::filesystem::path> getJitIncludeDirs()
 }
 else
 {
-TLLM_LOG_WARNING("Failed to find TensorRT-LLM installation, DeepGEMM will be disabled.");
+TLLM_LOG_WARNING("Failed to find TensorRT LLM installation, DeepGEMM will be disabled.");
 }
 }
 return includeDirs;
@@ -165,7 +165,7 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
 {
 void* ret = dllGetSym(handle, name);
 TLLM_CHECK_WITH_INFO(ret != nullptr,
-"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
+"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
 "built with UCX support, please rebuild in UCX-enabled environment.");
 return ret;
 };
@@ -105,7 +105,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm100(T* D, void const* A, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -146,15 +146,15 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
 occupancy);
 break;
 case tkc::CutlassTileConfigSM100::Undefined:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+"[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
 "heuristic.");
 break;
 default:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -177,7 +177,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm120(T* D, void const* A, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -205,16 +205,16 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm120(T* D, void const* A, void const* B,
 occupancy);
 break;
 case tkc::CutlassTileConfigSM120::Undefined:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM120::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+"[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+"[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -257,7 +257,7 @@ size_t dispatchMXFP8xMXFP4GemmClusterShapeSm100(T* D, void const* A, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+"[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -293,15 +293,15 @@ size_t dispatchMXFP8xMXFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
 occupancy);
 break;
 case tkc::CutlassTileConfigSM100::Undefined:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+"[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
 "heuristic.");
 break;
 default:
-throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
 break;
 }
 }
@@ -338,7 +338,7 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
+"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
 }
 }
 else if constexpr (fp4GemmType == FP4GemmType::W4A4_NVFP4_NVFP4)
@@ -356,13 +356,13 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
+"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
 }
 }
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
+"[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
 }
 }
@@ -93,7 +93,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
 int* occupancy)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture.");
+"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture.");
 }

 #else
@@ -250,7 +250,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
 {
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
 + std::to_string(mMaxSmemSize);
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
 }
 /* // Return workspace size */
 if (!A && !B && !D)
@@ -261,28 +261,28 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
 {
 std::string errMsg("Requested workspace size insufficient. Required "
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
 }
 auto can_implement = gemm.can_implement(args);
 if (can_implement != cutlass::Status::kSuccess)
 {
 std::string errMsg = "MXFP8xMXFP4 Gemm cutlass kernel will fail for params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
 }
 auto initStatus = gemm.initialize(args, workspace, stream);
 if (initStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to initialize cutlass MXFP8xMXFP4 gemm. Error: "
 + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
 }
 auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL());
 if (runStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg
 = "Failed to run cutlass MXFP8xMXFP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
 }
 return gemm.get_workspace_size(args);
 }
@@ -107,7 +107,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
 int* occupancy) \
 { \
 throw std::runtime_error( \
-"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \
+"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \
 }

 #else
@@ -268,7 +268,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
 { \
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \
 + std::to_string(mMaxSmemSize); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 /* // Return workspace size */ \
 if (!A && !B && !D) \
@@ -279,28 +279,28 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
 { \
 std::string errMsg("Requested workspace size insufficient. Required " \
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto can_implement = gemm.can_implement(args); \
 if (can_implement != cutlass::Status::kSuccess) \
 { \
 std::string errMsg = "FP4 Gemm cutlass kernel will fail for params. Error: " \
 + std::string(cutlassGetStatusString(can_implement)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto initStatus = gemm.initialize(args, workspace, stream); \
 if (initStatus != cutlass::Status::kSuccess) \
 { \
 std::string errMsg \
 = "Failed to initialize cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(initStatus)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \
 if (runStatus != cutlass::Status::kSuccess) \
 { \
 std::string errMsg \
 = "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 return gemm.get_workspace_size(args); \
 }
@@ -69,7 +69,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
 int* occupancy) \
 { \
 throw std::runtime_error( \
-"[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \
+"[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \
 }

 #else
@@ -224,7 +224,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
 { \
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \
 + std::to_string(mMaxSmemSize); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 /* // Return workspace size */ \
 if (!A && !B && !D) \
@@ -235,7 +235,7 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
 { \
 std::string errMsg("Requested workspace size insufficient. Required " \
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto initStatus = gemm.initialize(args, workspace); \
 if (initStatus != cutlass::Status::kSuccess) \
@@ -243,14 +243,14 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B,
 auto cudaErrMsg = cudaGetErrorString(cudaGetLastError()); \
 std::string errMsg = "Failed to initialize cutlass FP4 gemm. Error: " \
 + std::string(cutlass::cutlassGetStatusString(initStatus)) + " " + cudaErrMsg; \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \
 if (runStatus != cutlass::Status::kSuccess) \
 { \
 std::string errMsg \
 = "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlass::cutlassGetStatusString(runStatus)); \
-throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
 } \
 return gemm.get_workspace_size(args); \
 }
@@ -75,7 +75,7 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
 {
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
 + std::to_string(mMaxSmemSize);
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }

 // Return workspace size
@@ -88,7 +88,7 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
 {
 std::string errMsg("Requested workspace size insufficient. Required "
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }

 auto can_implement = gemm.can_implement(args);
@@ -96,21 +96,21 @@ size_t typedFp8RowwiseGemmKernelLauncher(Gemm gemm, typename Gemm::Arguments arg
 {
 std::string errMsg = "fp8RowwiseGemm cutlass kernel not implemented given the params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }

 auto initStatus = gemm.initialize(args, workspace, stream);
 if (initStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }

 auto runStatus = gemm.run(stream);
 if (runStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][fp8RowwiseGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fp8RowwiseGemm Runner] " + errMsg);
 }
 return gemm.get_workspace_size(args);
 }
@@ -210,7 +210,7 @@ size_t dispatchGemmConfigSm89(void* D, void const* A, void const* B, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm89] Config is invalid for "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm89] Config is invalid for "
 "Fp8 Rowwise GEMM.");
 break;
 }
@@ -299,16 +299,16 @@ size_t dispatchGemmToCutlassSm89(void* D, void const* A, void const* B, void con

 case tkc::CutlassTileConfig::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config undefined.");
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config undefined.");
 break;
 case tkc::CutlassTileConfig::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config should have "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] gemm config should have "
 "already been set by heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] Config is invalid for "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm89] Config is invalid for "
 "Fp8 Rowwise GEMM.");
 break;
 }
@@ -379,7 +379,7 @@ size_t genericFp8RowwiseGemmKernelLauncherSm90(void* D, void const* A, void cons
 Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy);
 #else // COMPILE_HOPPER_TMA_GEMMS
 throw std::runtime_error(
-"[TensorRT-LLm Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing "
+"[TensorRT LLM Error][Fp8RowwiseGemmKernelLauncherSm90] Please recompile with support for hopper by passing "
 "90-real as an arch to build_wheel.py.");
 #endif // COMPILE_HOPPER_TMA_GEMMS
 }
@@ -418,7 +418,7 @@ size_t dispatchGemmConfigSm90(void* D, void const* A, void const* B, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm90] Config is invalid for "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmConfigSm90] Config is invalid for "
 "Fp8 Rowwise GEMM.");
 break;
 }
@@ -468,16 +468,16 @@ size_t dispatchGemmToCutlassSm90(void* D, void const* A, void const* B, void con
 break;
 case tkc::CutlassTileConfigSM90::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
 "already been set by heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for "
 "Fp8 Rowwise GEMM.");
 break;
 }
@@ -517,7 +517,7 @@ size_t CutlassFp8RowwiseGemmRunner<T>::dispatchToArch(void* D, void const* A, vo
 #endif
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
 "Fp8 Rowwise GEMM");
 }
 return 0;
@@ -585,7 +585,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFp8RowwiseGemmRunner<T>::getConfigs()
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
+"[TensorRT LLM Error][CutlassFp8RowwiseGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS "
 "Fp8 Rowwise GEMM");
 }
 return candidateConfigs;
@@ -209,7 +209,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
 {
 std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }

 auto init_status = gemm.initialize(args, workspace, stream);
@@ -217,7 +217,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
 {
 std::string err_msg
 = "Failed to initialize cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(init_status));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }

 auto run_status = gemm.run(stream);
@@ -225,7 +225,7 @@ void generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const
 {
 std::string err_msg
 = "Failed to run cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(run_status));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }
 }

@@ -247,14 +247,14 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B, Sca
 // Multistage only supported on Ampere
 std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
 + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
-throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
 }
 else if constexpr (Stages == 2 && arch::kMinComputeCapability >= 89)
 {
 // Multistage only supported on Ampere
 std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
 + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
-throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
 }
 else if constexpr (cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value
 && arch::kMinComputeCapability < 89)
@@ -262,7 +262,7 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B, Sca
 // FP8 activation type only supported on Ada+ GPUs
 std::string err_msg = "Cutlass fpA_intB gemm not supported for arch "
 + std::to_string(arch::kMinComputeCapability) + " with activation type set to FP8";
-throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][filter_and_run_mixed_gemm] " + err_msg);
 }
 else
 {
@@ -301,7 +301,7 @@ void dispatch_gemm_config(ActivationType const* A, WeightType const* B, ScaleZer
 break;
 default:
 std::string err_msg = "dispatch_gemm_config does not support stages " + std::to_string(gemm_config.stages);
-throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_config] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_config] " + err_msg);
 break;
 }
 }
@@ -370,16 +370,16 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, Scal
 C, m, n, k, group_size, gemm_config, workspace, workspace_bytes, stream, occupancy);
 break;
 case tkc::CutlassTileConfig::Undefined:
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined.");
 break;
 case tkc::CutlassTileConfig::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have already been set by "
+"[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
+"[TensorRT LLM Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
 break;
 }
 }
@@ -387,7 +387,7 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, Scal
 {
 // This is not a limitation in CUTLASS. We just do not need to support this case.
 std::string err_msg = "The activation type must equal the scale, bias and output types on Ampere and earlier.";
-throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_to_cutlass] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_to_cutlass] " + err_msg);
 }
 }
@@ -439,7 +439,7 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
 if constexpr (cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada needs "
+"[TensorRT LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada needs "
 "CUDA>=12.4");
 }
 #endif
@@ -459,7 +459,7 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] Arch unsupported for CUTLASS mixed type "
+"[TensorRT LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] Arch unsupported for CUTLASS mixed type "
 "GEMM");
 }
 }
@@ -62,7 +62,7 @@ void sm90_dispatch_epilogue_schedules(ActivationType const* A, WeightType const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_epilogue_schedules] epilogue schedule config is invalid for "
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_epilogue_schedules] epilogue schedule config is invalid for "
 "mixed "
 "type GEMM.");
 break;
@@ -135,7 +135,7 @@ void sm90_dispatch_mainloop_schedules(ActivationType const* A, WeightType const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] mainloop schedule config is invalid "
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] mainloop schedule config is invalid "
 "for "
 "mixed type GEMM.");
 break;
@@ -144,7 +144,7 @@ void sm90_dispatch_mainloop_schedules(ActivationType const* A, WeightType const*
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] Unsupported CTA and Cluster shapes for "
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] Unsupported CTA and Cluster shapes for "
 "mixed type GEMM.");
 }
 }
@@ -181,7 +181,7 @@ void sm90_dispatch_gemm_config(ActivationType const* A, WeightType const* B, Sca
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][fpA_intB][dispatch_CGA_config] Config is invalid for mixed type GEMM.");
+"[TensorRT LLM Error][fpA_intB][dispatch_CGA_config] Config is invalid for mixed type GEMM.");
 break;
 }
 }
@@ -254,16 +254,16 @@ void sm90_dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B,
 break;
 case tkc::CutlassTileConfigSM90::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config undefined.");
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config should have already been set by "
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config should have already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
+"[TensorRT LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] Config is invalid for mixed type GEMM.");
 break;
 }
 }
@@ -193,7 +193,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 if (group_size % cta_shape_k != 0)
 {
 std::string err_msg = "The group size must a multiple of " + std::to_string(cta_shape_k);
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner]" + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner]" + err_msg);
 }

 if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY)
@@ -249,7 +249,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 Gemm gemm;
 if (gemm.get_workspace_size(args) > workspace_bytes)
 {
-TLLM_LOG_ERROR("[TensorRT-LLm Error][fpA_intB Runner] given workspace size insufficient.");
+TLLM_LOG_ERROR("[TensorRT LLM Error][fpA_intB Runner] given workspace size insufficient.");
 }

 auto can_implement = gemm.can_implement(args);
@@ -258,7 +258,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
 std::cout << err_msg << std::endl;
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }

 auto init_status = gemm.initialize(args, workspace, stream);
@@ -266,7 +266,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 {
 std::string err_msg = "Failed to initialize cutlass fpA_intB gemm. Error: "
 + std::string(cutlassGetStatusString(init_status));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }

 auto run_status = gemm.run(stream);
@@ -274,13 +274,13 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType
 {
 std::string err_msg
 = "Failed to run cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(run_status));
-throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg);
 }
 }
 else
 {
 std::stringstream ss;
-ss << "[TensorRT-LLm Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << ","
+ss << "[TensorRT LLM Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << ","
 << (int64_t) cute::size<1>(CTAShape{}) << "," << (int64_t) cute::size<2>(CTAShape{}) << ") ("
 << (int64_t) cute::size<0>(ClusterShape{}) << "," << (int64_t) cute::size<1>(ClusterShape{}) << ","
 << (int64_t) cute::size<2>(ClusterShape{}) << ") not compiled with FAST_BUILD.";
@@ -290,7 +290,7 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType

 #else // COMPILE_HOPPER_TMA_GEMMS
 throw std::runtime_error(
-"[TensorRT-LLm Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an arch "
+"[TensorRT LLM Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an arch "
 "to build_wheel.py.");
 #endif // COMPILE_HOPPER_TMA_GEMMS
 }
@@ -67,7 +67,7 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
 {
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
 + std::to_string(mMaxSmemSize);
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }

 // Return workspace size
@@ -80,7 +80,7 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
 {
 std::string errMsg("Requested workspace size insufficient. Required "
 + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }

 auto can_implement = gemm.can_implement(args);
@@ -88,21 +88,21 @@ size_t typedGemmGatedKernelLauncher(Gemm gemm, typename Gemm::Arguments args, vo
 {
 std::string errMsg = "fusedGatedGemm cutlass kernel not implemented given the params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }

 auto initStatus = gemm.initialize(args, workspace, stream);
 if (initStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }

 auto runStatus = gemm.run(stream);
 if (runStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner] " + errMsg);
 }
 return gemm.get_workspace_size(args);
 }
@@ -165,7 +165,7 @@ size_t genericGemmGatedKernelLauncherSm90(void* D, void const* A, void const* B,
 return typedGemmGatedKernelLauncher(Gemm{}, args, D, A, B, C_bias, workspace, workspaceBytes, stream, occupancy);
 #else // COMPILE_HOPPER_TMA_GEMMS
 throw std::runtime_error(
-"[TensorRT-LLm Error][GemmGatedKernelLauncherSm90] Please recompile with support for hopper by passing 90-real "
+"[TensorRT LLM Error][GemmGatedKernelLauncherSm90] Please recompile with support for hopper by passing 90-real "
 "as an arch to build_wheel.py.");
 #endif // COMPILE_HOPPER_TMA_GEMMS
 }
@@ -204,7 +204,7 @@ size_t dispatchGemmConfigSm90(void* D, void const* A, void const* B, void const*
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmConfigSm90] Config is invalid for fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmConfigSm90] Config is invalid for fused "
 "gated GEMM.");
 break;
 }
@@ -255,17 +255,17 @@ size_t dispatchGemmToCutlassSm90(void* D, void const* A, void const* B, void con
 break;
 case tkc::CutlassTileConfigSM90::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config undefined.");
 break;
 case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] gemm config should have "
 "already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][dispatchGemmToCutlassSm90] Config is invalid for fused "
 "gated GEMM.");
 break;
 }
@@ -302,14 +302,14 @@ size_t CutlassFusedGatedGemmRunner<T>::dispatchToArch(void* D, void const* A, vo
 #endif
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
 "gated GEMM");
 }
 }
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
 "gated "
 "GEMM");
 }
@@ -340,7 +340,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFusedGatedGemmRunner<T>::getConfigs()
 if (mSm != 90)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS fused "
 "gated GEMM");
 }
 tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param
@@ -378,7 +378,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassFusedGatedGemmRunner<T>::getConfigs()
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
+"[TensorRT LLM Error][CutlassFusedGatedGemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS fused "
 "gated "
 "GEMM");
 }
@@ -150,7 +150,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
 {
 std::string errMsg = "int8gemm cutlass kernel will fail for params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
 }

 auto initStatus = gemm.initialize(args, workspace, stream);
@@ -158,7 +158,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
 {
 std::string errMsg
 = "Failed to initialize cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
 }

 auto runStatus = gemm.run(stream);
@@ -166,7 +166,7 @@ void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMo
 {
 std::string errMsg
 = "Failed to run cutlass int8 gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][int8gemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][int8gemm Runner] " + errMsg);
 }
 }

@@ -180,7 +180,7 @@ struct dispatchStages
 TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
 std::string errMsg = "Cutlass int8 gemm. Not instantiates for arch "
 + std::to_string(arch::kMinComputeCapability) + " with stages set to " + std::to_string(Stages);
-throw std::runtime_error("[TensorRT-LLM Error][dispatchStages::dispatch] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][dispatchStages::dispatch] " + errMsg);
 }
 };

@@ -248,7 +248,7 @@ void dispatchGemmConfig(int8_t const* A, int8_t const* B, tk::QuantMode quantOpt
 break;
 default:
 std::string errMsg = "dispatchGemmConfig does not support stages " + std::to_string(gemmConfig.stages);
-throw std::runtime_error("[TensorRT-LLM Error][dispatch_gemm_config] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][dispatch_gemm_config] " + errMsg);
 break;
 }
 }
@@ -288,16 +288,16 @@ void dispatchGemmToCutlass(int8_t const* A, int8_t const* B, tk::QuantMode quant
 quantOption, alphaCol, alphaRow, C, m, n, k, gemmConfig, workspace, workspaceBytes, stream, occupancy);
 break;
 case tkc::CutlassTileConfig::Undefined:
-throw std::runtime_error("[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] gemm config undefined.");
+throw std::runtime_error("[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] gemm config undefined.");
 break;
 case tkc::CutlassTileConfig::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] gemm config should have already been set by "
+"[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] gemm config should have already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLM Error][int8][dispatch_gemm_to_cutlass] Config is invalid for int8 GEMM.");
+"[TensorRT LLM Error][int8][dispatch_gemm_to_cutlass] Config is invalid for int8 GEMM.");
 break;
 }
 }
@@ -342,7 +342,7 @@ void CutlassInt8GemmRunner<T>::dispatchToArch(int8_t const* A, int8_t const* B,
 else
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
+"[TensorRT LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
 }
 }

@@ -364,7 +364,7 @@ std::vector<tkc::CutlassGemmConfig> CutlassInt8GemmRunner<T>::getConfigs() const
 if (mSm <= 70)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
+"[TensorRT LLM Error][CutlassInt8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS int8 GEMM");
 }

 std::vector<tkc::CutlassGemmConfig> candidateConfigs = get_candidate_configs(mSm, SPLIT_K_LIMIT, config_type_param);
@@ -195,7 +195,7 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
 {
 std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
 + std::to_string(mMaxSmemSize);
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }

 // Return workspace size
@@ -208,7 +208,7 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
 {
 std::string errMsg("Requested workspace size insufficient. Required "
 + std::to_string(gemm.get_workspace_size(arguments)) + ", got " + std::to_string(workspaceBytes));
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }

 auto can_implement = gemm.can_implement(arguments);
@@ -216,26 +216,26 @@ size_t genericFp8LowLatencyGemmKernelLauncherSm90(__nv_fp8_e4m3 const* A, __nv_f
 {
 std::string errMsg = "Fp8LowLatencyGemm cutlass kernel not implemented given the params. Error: "
 + std::string(cutlassGetStatusString(can_implement));
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }

 auto initStatus = gemm.initialize(arguments, workspacePtr);
 if (initStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to initialize. Error: " + std::string(cutlassGetStatusString(initStatus));
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }

 auto runStatus = gemm.run(stream, nullptr, pdl_overlap_ratio >= 0);
 if (runStatus != cutlass::Status::kSuccess)
 {
 std::string errMsg = "Failed to run gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-throw std::runtime_error("[TensorRT-LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
+throw std::runtime_error("[TensorRT LLM Error][Fp8LowLatencyGemm Runner] " + errMsg);
 }
 return gemm.get_workspace_size(arguments);
 #else // COMPILE_HOPPER_TMA_GEMMS
 throw std::runtime_error(
-"[TensorRT-LLm Error][genericFp8LowLatencyGemmKernelLauncherSm90] Please recompile with support for hopper by "
+"[TensorRT LLM Error][genericFp8LowLatencyGemmKernelLauncherSm90] Please recompile with support for hopper by "
 "passing 90-real as an arch to build_wheel.py.");
 #endif // COMPILE_HOPPER_TMA_GEMMS
 }
@@ -264,7 +264,7 @@ size_t dispatchLowLatencyGemmCultassKernelSchedSm90(__nv_fp8_e4m3 const* A, __nv
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmCultassKernelSchedSm90] Config "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmCultassKernelSchedSm90] Config "
 "is "
 "invalid for low latency fp8 gemm");
 break;
@@ -300,7 +300,7 @@ size_t dispatchLowLatencyGemmClusterShapeSm90(__nv_fp8_e4m3 const* A, __nv_fp8_e

 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmClusterShapeSm90] Config is "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmClusterShapeSm90] Config is "
 "invalid for low latency fp8 gemm");
 break;
 }
@@ -369,19 +369,19 @@ size_t dispatchLowLatencyGemmToCutlassSm90(__nv_fp8_e4m3 const* A, __nv_fp8_e4m3
 break;
 case tkc::CutlassTileConfigSM90::Undefined:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
 "undefined.");
 break;
 case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] gemm config "
 "should have "
 "already been set by "
 "heuristic.");
 break;
 default:
 throw std::runtime_error(
-"[TensorRT-LLm Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] Config is "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][dispatchLowLatencyGemmToCutlassSm90] Config is "
 "invalid for low latency fp8 gemm");
 break;
 }
@@ -413,7 +413,7 @@ size_t CutlassLowLatencyFp8GemmRunner<T>::dispatchToArch(__nv_fp8_e4m3 const* A,
 {

 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS Low "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] dtype unsupported for CUTLASS Low "
 "Latency Gemm");
 }
 return 0;
@@ -499,7 +499,7 @@ std::vector<ConfigType> CutlassLowLatencyFp8GemmRunner<T>::getConfigs() const
 if (mSm != 90)
 {
 throw std::runtime_error(
-"[TensorRT-LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP8 Low "
+"[TensorRT LLM Error][CutlassLowLatencyFp8GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP8 Low "
 "Latency GEMM");
 }
 tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param
@@ -235,12 +235,12 @@ struct BatchedGemmData
 void const* mPtrBias{nullptr};

 // The output tensor scaling factor for MxFp{4,8}, Fp8 and NvFp4 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [B].
 float const* mPtrScaleC{nullptr};

 // The output gate scale for MxFp{4,8} and NvFp4 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [B].
 float const* mPtrScaleGate{nullptr};

@@ -214,12 +214,12 @@ struct KernelParams
 // ScaleC = SEncC
 //
 // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [B]. One scaling factor per tensor in batch.
 float const* ptrScaleC{nullptr};

 // The output gate scale for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [B]. One scaling factor per tensor in batch.
 float const* ptrScaleGate{nullptr};

@@ -143,7 +143,7 @@ struct GemmData
 void const* mPtrPerTokenSfB{nullptr};

 // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 void* mPtrScaleC{nullptr};
 };

@@ -204,7 +204,7 @@ struct KernelParams
 void* ptrSfC;

 // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 float const* ptrScaleC;

@@ -133,11 +133,11 @@ struct GemmGatedActData
 void const* mPtrPerTokenSfB{nullptr};

 // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 void const* mPtrScaleC{nullptr};
 // The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 void const* mPtrScaleGate{nullptr};
 };

@@ -290,7 +290,7 @@ struct KernelParams
 // y = act(ptrScaleGate[0] * y1) * (ptrScaleC[0] * y2)
 //
 // The output tensor scaling factor for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.
-// TensorRT-LLM API requires a scaling factor on the device.
+// TensorRT LLM API requires a scaling factor on the device.
 // Shape is [1].
 float const* ptrScaleC;
 // The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization.
@ -73,7 +73,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con

NB_MODULE(TRTLLM_NB_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
m.doc() = "TensorRT LLM Python bindings for C++ runtime";
m.attr("binding_type") = "nanobind";
nb::set_leak_warnings(false);


@ -125,7 +125,7 @@ BertAttentionPlugin::BertAttentionPlugin(void const* data, size_t length)

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ CudaStreamPlugin::CudaStreamPlugin(void const* data, size_t length)

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -58,7 +58,7 @@ EagleDecodeDraftTokensPlugin::EagleDecodeDraftTokensPlugin(void const* data, siz
read(d, mTopKSampling);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
static_cast<int>(length), static_cast<int>(d - a));
}

@ -52,7 +52,7 @@ EagleSampleAndAcceptDraftTokensPlugin::EagleSampleAndAcceptDraftTokensPlugin(voi
read(d, mDtype);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -47,7 +47,7 @@ FusedLayernormPlugin::FusedLayernormPlugin(void const* data, size_t length)
read(d, mType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -203,7 +203,7 @@ static GemmAllReducePluginOptions deserializeOptions(void const*& data, size_t l

TLLM_CHECK_WITH_INFO(end == begin + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (end - begin));


@ -179,7 +179,7 @@ GemmPlugin::GemmPlugin(void const* data, size_t length, GemmPlugin::PluginProfil

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -183,7 +183,7 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const* data, size_t leng
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
TLLM_CHECK_WITH_INFO((smVersion() >= 80) || (mType != nvinfer1::DataType::kBF16),

@ -35,7 +35,7 @@ IdentityPlugin::IdentityPlugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -61,7 +61,7 @@ LayernormQuantizationPlugin::LayernormQuantizationPlugin(void const* data, size_
read(d, mOutputType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ LookupPlugin::LookupPlugin(void const* data, size_t length)
read(d, mRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -78,7 +78,7 @@ LoraPlugin::LoraPlugin(void const* data, size_t length, LoraPlugin::PluginProfil

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -124,7 +124,7 @@ LowLatencyGemmPlugin::LowLatencyGemmPlugin(void const* data, size_t length, Plug
mPluginProfiler->deserialize(d, mDims, mGemmId);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -159,7 +159,7 @@ LowLatencyGemmSwigluPlugin::LowLatencyGemmSwigluPlugin(
mPluginProfiler->deserialize(d, mDims, mGemmId);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -175,7 +175,7 @@ MixtureOfExpertsPlugin::MixtureOfExpertsPlugin(void const* data, size_t length,

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ AllgatherPlugin::AllgatherPlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -77,7 +77,7 @@ AllreducePlugin::AllreducePlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
check();

@ -45,7 +45,7 @@ RecvPlugin::RecvPlugin(void const* data, size_t length)
read(d, mSrcRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -48,7 +48,7 @@ ReduceScatterPlugin::ReduceScatterPlugin(void const* data, size_t length)
}
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -46,7 +46,7 @@ SendPlugin::SendPlugin(void const* data, size_t length)
read(d, mTgtRank);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -64,7 +64,7 @@ QServeGemmPlugin::QServeGemmPlugin(void const* data, size_t length)

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -51,7 +51,7 @@ QuantizePerTokenPlugin::QuantizePerTokenPlugin(void const* data, size_t length)
read(d, mSumPerToken);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -35,7 +35,7 @@ QuantizeTensorPlugin::QuantizeTensorPlugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -41,7 +41,7 @@ QuantizeToFP4Plugin::QuantizeToFP4Plugin(void const* data, size_t length)
char const *d = reinterpret_cast<char const*>(data), *a = d;
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -58,7 +58,7 @@ RmsnormQuantizationPlugin::RmsnormQuantizationPlugin(void const* data, size_t le
read(d, mOutputType);
TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -98,7 +98,7 @@ SmoothQuantGemmPlugin::SmoothQuantGemmPlugin(

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -148,7 +148,7 @@ WeightOnlyGroupwiseQuantMatmulPlugin::WeightOnlyGroupwiseQuantMatmulPlugin(

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -126,7 +126,7 @@ WeightOnlyQuantMatmulPlugin::WeightOnlyQuantMatmulPlugin(

TLLM_CHECK_WITH_INFO(d == a + length,
"Expected length (%d) != real length (%d). This is often "
"caused by using different TensorRT-LLM version to build "
"caused by using different TensorRT LLM version to build "
"engine and run engine.",
(int) length, (int) (d - a));
}

@ -67,7 +67,7 @@ tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& con

PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
m.doc() = "TensorRT LLM Python bindings for C++ runtime";
m.attr("binding_type") = "pybind";

// Create MpiComm binding first since it's used in the executor bindings

@ -56,7 +56,7 @@ public:
}

/// @brief If multiple TensorRT optimization profiles are built in the engine, this function selects the
/// corresponding profile that is going to be used based on the runtime shape, for now, TensorRT-LLM only split
/// corresponding profile that is going to be used based on the runtime shape, for now, TensorRT LLM only split
/// multiple profiles on the num_tokens dimension, hence the profile index is selected based on which profile
/// handles the actual num_tokens
/// @return The index of the selected TensorRT optimization profile

@ -330,7 +330,7 @@ protected:
{
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};
@ -732,7 +732,7 @@ protected:
{
void* ret = dllGetSym(handle, name);
TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};

@ -70,7 +70,7 @@ std::unique_ptr<texec::kv_cache::ConnectionManager> makeOneUcxConnectionManager(
void* ret = dllGetSym(handle, name);

TLLM_CHECK_WITH_INFO(ret != nullptr,
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
"Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
"built with UCX support, please rebuild in UCX-enabled environment.");
return ret;
};

@ -243,7 +243,7 @@ Result run(std::string description, Options& options, Buffers& buffers)
auto can_implement = device_gemm.can_implement(arguments);
if (can_implement != cutlass::Status::kSuccess)
{
throw std::runtime_error("[TensorRT-LLM Error][fusedGatedGemm Runner]");
throw std::runtime_error("[TensorRT LLM Error][fusedGatedGemm Runner]");
}

// Initialize CUTLASS kernel with arguments and workspace pointer
@ -481,7 +481,7 @@ int main(int argc, char const** argv)
#ifdef COMPILE_HOPPER_TMA_GEMMS
Result hopperFp8 = run<Gemm>(std::string("Hopper fp8 swiglu"), options, buffers);
#else // COMPILE_HOPPER_TMA_GEMMS
std::cout << "[TensorRT-LLm Error][GemmSwigluKernelTestSm90Fp8] Please recompile with support for hopper by "
std::cout << "[TensorRT LLM Error][GemmSwigluKernelTestSm90Fp8] Please recompile with support for hopper by "
"passing 90-real as an arch to build_wheel.py."
<< std::endl;
#endif // COMPILE_HOPPER_TMA_GEMMS

@ -338,7 +338,7 @@ TEST(GemmSwigluRunner, Sm90FP8)
Result hopperFp8 = run("SM90 FP8 WS GEMM", options, buffers);
EXPECT_TRUE(hopperFp8.passed);
#else // COMPILE_HOPPER_TMA_GEMMS
std::cout << "[TensorRT-LLm Error][GemmSwigluRunnerTest] Please recompile with support for hopper by passing "
std::cout << "[TensorRT LLM Error][GemmSwigluRunnerTest] Please recompile with support for hopper by passing "
"90-real as an arch to build_wheel.py."
<< std::endl;
#endif // COMPILE_HOPPER_TMA_GEMMS

@ -1,6 +1,6 @@
"""
NOTE: This FastAPI-based server is only an example for demonstrating the usage
of TensorRT-LLM LLM API. It is not intended for production use.
of TensorRT LLM LLM API. It is not intended for production use.
For production, use the `trtllm-serve` command. The server exposes OpenAI compatible API endpoints.
"""


@ -28,11 +28,11 @@ int main(int argc, char* argv[])
void log(nvinfer1::ILogger::Severity severity, char const* msg) noexcept override
{
if (severity <= nvinfer1::ILogger::Severity::kERROR)
std::cerr << "[TensorRT-LLM ERR]: " << msg << std::endl;
std::cerr << "[TensorRT LLM ERR]: " << msg << std::endl;
else if (severity == nvinfer1::ILogger::Severity::kWARNING)
std::cerr << "[TensorRT-LLM WARNING]: " << msg << std::endl;
std::cerr << "[TensorRT LLM WARNING]: " << msg << std::endl;
else
std::cout << "[TensorRT-LLM LOG]: " << msg << std::endl;
std::cout << "[TensorRT LLM LOG]: " << msg << std::endl;
}
};


@ -144,7 +144,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -12,7 +12,7 @@ def parse_arguments():
'--output_path',
type=str,
default='config.json',
help='The path to save the TensorRT-LLM checkpoint config.json file')
help='The path to save the TensorRT LLM checkpoint config.json file')
parser.add_argument('--architecture', type=str, default='GPTForCausalLM')
parser.add_argument('--dtype',
type=str,

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

@ -29,7 +29,7 @@
# MOUNT_DIR: the directory to mount in the container
# MOUNT_DEST: the destination directory in the container
# WORKDIR: the working directory in the container
# SOURCE_ROOT: the path to the TensorRT-LLM source
# SOURCE_ROOT: the path to the TensorRT LLM source
# PROLOGUE: the prologue to run before the script
# LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is
# not supported in Slurm mode, you need to download the model and put it in

@ -161,7 +161,7 @@ def demonstrate_with_logprobs(prompt: str):

def run_all_demonstrations(model_path: Optional[str] = None):
"""Run all sampling demonstrations."""
print("🚀 TensorRT-LLM Sampling Techniques Showcase")
print("🚀 TensorRT LLM Sampling Techniques Showcase")
print("=" * 50)

# Use the first prompt for most demonstrations

@ -161,7 +161,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -53,7 +53,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -156,7 +156,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=Path,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--calib_dataset',
type=str,

@ -190,7 +190,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -90,7 +90,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -79,7 +79,7 @@ def parse_arguments():
type=str,
default='trtllm_checkpoint',
required=True,
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -79,7 +79,7 @@ def parse_arguments():
type=str,
default='trtllm_checkpoint',
required=True,
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -87,7 +87,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument('--input_size',
type=int,
default=64,

@ -74,7 +74,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -61,7 +61,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -76,7 +76,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -110,7 +110,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -37,7 +37,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -124,7 +124,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -76,7 +76,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -44,7 +44,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument('--caption_channels',
type=int,
default=4096,

@ -47,7 +47,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -79,7 +79,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -260,7 +260,7 @@ def main() -> None:
trt_llm_config.query_pre_attn_scalar = ckpt_config.query_pre_attn_scalar

trt_llm_config_dict = trt_llm_config.to_dict()
print(f"Determined TensorRT-LLM configuration {trt_llm_config_dict}")
print(f"Determined TensorRT LLM configuration {trt_llm_config_dict}")

save_config(trt_llm_config, output_dir=args.output_model_dir, log=True)


@ -127,7 +127,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -132,7 +132,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -71,7 +71,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -227,7 +227,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -51,7 +51,7 @@ def parse_args():
'--max_input_len',
type=int,
default=6400,
help='The max input length TensorRT-LLM engine was built with')
help='The max input length TensorRT LLM engine was built with')
parser.add_argument('--log_level', type=str, default='info')
parser.add_argument('--max_ite', type=int, default=5)
parser.add_argument(
@ -392,7 +392,7 @@ def main(args):
references=[hf_summary[ite][beam_idx][batch_idx]])

for beam_idx in range(args.num_beams):
logger.info(f"TensorRT-LLM beam {beam_idx} result")
logger.info(f"TensorRT LLM beam {beam_idx} result")
computed_metrics_tensorrt_llm = metric_tensorrt_llm[
beam_idx].compute()
for key in computed_metrics_tensorrt_llm.keys():

@ -59,7 +59,7 @@ def parse_arguments():
'--output_dir',
type=Path,
default='mamba_tllm_checkpoint',
help='The path to save the mamba TensorRT-LLM checkpoint')
help='The path to save the mamba TensorRT LLM checkpoint')
parser.add_argument('--log_level', type=str, default='info')
parser.add_argument(
'--workers',

@ -192,7 +192,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -132,11 +132,11 @@ def load_hf_model(args):


def load_trtllm_model(args):
profiler.start('load TensorRT-LLM model')
profiler.start('load TensorRT LLM model')
trtllm_model = MultimodalModelRunner(args)
profiler.stop('load TensorRT-LLM model')
profiler.stop('load TensorRT LLM model')
logger.info(
f'Load TensorRT-LLM model takes: {profiler.elapsed_time_in_sec("load TensorRT-LLM model")} sec'
f'Load TensorRT LLM model takes: {profiler.elapsed_time_in_sec("load TensorRT LLM model")} sec'
)
return trtllm_model


@ -56,7 +56,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -81,7 +81,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -137,7 +137,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,

@ -316,7 +316,7 @@ class QWenInfer(object):
stream.cuda_stream)
stream.synchronize()
audio_time = profiler.stop("Audio") / run_time
logger.info(f"TensorRT-LLM Audio latency: {audio_time:3f} sec ")
logger.info(f"TensorRT LLM Audio latency: {audio_time:3f} sec ")

assert ok, "Runtime execution failed for audio session"

@ -567,7 +567,7 @@ class QWenInfer(object):
print(f'Output(beam: {beam}): "{output_text}"')
logger.info(f"Input length={input_lengths[b]}")
logger.info(f"Output length={output_ids.shape}")
logger.info(f"TensorRT-LLM QWen time: {Qwen_time:3f} sec ")
logger.info(f"TensorRT LLM QWen time: {Qwen_time:3f} sec ")
if isinstance(history, list):
history.append({'role': 'assistant', 'content': output_text})
return output_text, past_audio_features

@ -418,7 +418,7 @@ class QWenInfer(object):
print(f'Output(beam: {beam}): "{output_text}"')
logger.info(f"Input length={input_lengths[b]}")
logger.info(f"Output length={output_ids.shape}")
logger.info(f"TensorRT-LLM QWen time: {Qwen_time:3f} sec ")
logger.info(f"TensorRT LLM QWen time: {Qwen_time:3f} sec ")
history.append((query, output_text))
return output_text

@ -516,7 +516,7 @@ def vit_process(image_path, vit_engine_path, stream):
ok = session_vit.run(visual_inputs, visual_outputs, stream)
profiler.stop("ViT")
Vit_time = profiler.elapsed_time_in_sec("ViT") / run_time
logger.info(f"TensorRT-LLM ViT latency: {Vit_time:3f} sec ")
logger.info(f"TensorRT LLM ViT latency: {Vit_time:3f} sec ")

assert ok, "Runtime execution failed for vit session"


@ -41,7 +41,7 @@ def parse_arguments():
"--output_dir",
type=Path,
default="recurrentgemma_tllm_checkpoint",
help="The path to save the recurrentgemma TensorRT-LLM checkpoint")
help="The path to save the recurrentgemma TensorRT LLM checkpoint")
parser.add_argument("--log_level", type=str, default="info")
args = parser.parse_args()
return args
@ -506,11 +506,11 @@ def main():
)

trt_llm_config_dict = trt_llm_config.to_dict()
print(f"Determined TensorRT-LLM configuration {trt_llm_config_dict}")
print(f"Determined TensorRT LLM configuration {trt_llm_config_dict}")

config_path = args.output_dir / "config.json"
config_path.parent.mkdir(exist_ok=True, parents=True)
LOGGER.debug(f"Saving TensorRT-LLM configuration to {config_path}")
LOGGER.debug(f"Saving TensorRT LLM configuration to {config_path}")
with config_path.open("w") as config_file:
json.dump(trt_llm_config_dict, config_file, indent=4)

@ -42,7 +42,7 @@ def parse_arguments():
parser.add_argument('--output_dir',
type=str,
default='tllm_checkpoint',
help='The path to save the TensorRT-LLM checkpoint')
help='The path to save the TensorRT LLM checkpoint')
parser.add_argument(
'--workers',
type=int,
Some files were not shown because too many files have changed in this diff.