From 62298bc4730b3b862964521a4b02824a318e6092 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang <4936589+zhenhuaw-me@users.noreply.github.com> Date: Thu, 24 Jul 2025 23:01:15 +0800 Subject: [PATCH] perf: customize cublasLt algo for Llama 3.3 70B TP4 (#6315) Signed-off-by: Zhenhua Wang --- .clangd | 2 +- cpp/tensorrt_llm/thop/cublasScaledMM.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.clangd b/.clangd index 99f2765a55..c8d6fdda36 100644 --- a/.clangd +++ b/.clangd @@ -29,7 +29,7 @@ CompileFlags: # Tweak the clangd parse settings for all files CompileFlags: Compiler: clang++ - CompilationDatabase: . + CompilationDatabase: cpp/build Add: # report all errors - "-ferror-limit=0" diff --git a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp index ed90c31cf5..d39b7b693f 100644 --- a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp @@ -66,6 +66,9 @@ AlgoListType fp8_algo_list = { {{8, 8192, 8192}, {393, 36, 1, 0, 0, 5, 2}}, // [-algo66 -m_tile10 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga2 -m_scheduling1] {{8, 8192, 57344}, {10, 36, 1, 0, 0, 1, 2}}, + // Llama-3.3-70B TP4 (this is the default algo on B200. Here we aim to use the same algo on GB200.) + // [-algo66 -m_tile393 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga4 -m_scheduling1] + {{8, 8192, 14336}, {393, 36, 1, 0, 1, 1, 4}}, }; void set_algo_attr(cublasLtMatmulAlgo_t& algo, std::array const& attr_list)