perf: customize cuBLASLt algo for Llama 3.3 70B TP4 (#6315)

Signed-off-by: Zhenhua Wang <zhenhuaw@nvidia.com>
This commit is contained in:
Zhenhua Wang 2025-07-24 23:01:15 +08:00 committed by GitHub
parent 7b6aadc800
commit 62298bc473
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 4 additions and 1 deletions

View File

@@ -29,7 +29,7 @@ CompileFlags:
# Tweak the clangd parse settings for all files # Tweak the clangd parse settings for all files
CompileFlags: CompileFlags:
Compiler: clang++ Compiler: clang++
CompilationDatabase: . CompilationDatabase: cpp/build
Add: Add:
# report all errors # report all errors
- "-ferror-limit=0" - "-ferror-limit=0"

View File

@@ -66,6 +66,9 @@ AlgoListType fp8_algo_list = {
{{8, 8192, 8192}, {393, 36, 1, 0, 0, 5, 2}}, {{8, 8192, 8192}, {393, 36, 1, 0, 0, 5, 2}},
// [-algo66 -m_tile10 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga2 -m_scheduling1] // [-algo66 -m_tile10 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga2 -m_scheduling1]
{{8, 8192, 57344}, {10, 36, 1, 0, 0, 1, 2}}, {{8, 8192, 57344}, {10, 36, 1, 0, 0, 1, 2}},
// Llama-3.3-70B TP4 (this is the default algo on B200. Here we aim to use the same algo on GB200.)
// [-algo66 -m_tile393 -m_stages36 -m_numsK1 -m_reduction0 -m_swizzle0 -m_custom1 -m_mma0 -m_cga4 -m_scheduling1]
{{8, 8192, 14336}, {393, 36, 1, 0, 1, 1, 4}},
}; };
void set_algo_attr(cublasLtMatmulAlgo_t& algo, std::array<int, 7> const& attr_list) void set_algo_attr(cublasLtMatmulAlgo_t& algo, std::array<int, 7> const& attr_list)